lcpo.cc 18 KB
Newer Older
Evan Pretti's avatar
Evan Pretti committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
typedef struct {
    real4 data12;
    real4 data21;
} NeighborData;

/**
 * Load the position of an atom from global memory.
 */
inline DEVICE real3 loadCondensedPos(GLOBAL const real* RESTRICT condensedPos, int atom) {
    int offset = 3 * atom;
    return make_real3(condensedPos[offset], condensedPos[offset + 1], condensedPos[offset + 2]);
}

/**
 * Record the force on an atom to global memory.
 */
inline DEVICE void storeForce(int atom, real3 force, GLOBAL mm_ulong* RESTRICT forceBuffers) {
    ATOMIC_ADD(&forceBuffers[atom], (mm_ulong) realToFixedPoint(force.x));
    ATOMIC_ADD(&forceBuffers[atom + PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y));
    ATOMIC_ADD(&forceBuffers[atom + 2 * PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.z));
}

/**
 * Compute the difference between two vectors, taking periodic boundary
 * conditions into account and setting the fourth component to the squared
 * magnitude.
 */
inline DEVICE real4 delta(real3 vec1, real3 vec2, real4 periodicBoxSize, real4 invPeriodicBoxSize,
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) {

    real4 result = make_real4(vec1.x - vec2.x, vec1.y - vec2.y, vec1.z - vec2.z, 0);
#ifdef USE_PERIODIC
    APPLY_PERIODIC_TO_DELTA(result)
#endif
    result.w = result.x * result.x + result.y * result.y + result.z * result.z;
    return result;
}

/**
 * Copies positions of active particles to a separate array for performance.
 */
KERNEL void condensePos(GLOBAL const int* RESTRICT activeParticles, GLOBAL const real4* RESTRICT posq, GLOBAL real* RESTRICT condensedPos) {
    for (int i = GLOBAL_ID; i < NUM_ACTIVE; i += GLOBAL_SIZE) {
        real3 pos = trimTo3(posq[activeParticles[i]]);
        int iOffset = 3 * i;
        condensedPos[iOffset] = pos.x;
        condensedPos[iOffset + 1] = pos.y;
        condensedPos[iOffset + 2] = pos.z;
    }
}

/**
 * Find a bounding box for the atoms in each block.
 */
KERNEL void findBlockBounds(real4 periodicBoxSize, real4 invPeriodicBoxSize,
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
        GLOBAL const real* RESTRICT condensedPos, GLOBAL real4* RESTRICT blockCenter,
        GLOBAL real4* RESTRICT blockBoundingBox, GLOBAL int* RESTRICT numNeighborPairs) {

    // Each thread (index) processes one block of WARP_SIZE atoms (from base
    // through min(base + WARP_SIZE, NUM_ACTIVE) - 1) at a time.

    int index = GLOBAL_ID;
    int base = index * WARP_SIZE;

    while (base < NUM_ACTIVE) {
        real3 pos3 = loadCondensedPos(condensedPos, base);
        real4 pos = make_real4(pos3.x, pos3.y, pos3.z, 0);
#ifdef USE_PERIODIC
        APPLY_PERIODIC_TO_POS(pos)
#endif
        real4 minPos = pos;
        real4 maxPos = pos;
        int last = min(base + WARP_SIZE, NUM_ACTIVE);
        for (int i = base + 1; i < last; i++) {
            pos3 = loadCondensedPos(condensedPos, i);
            pos = make_real4(pos3.x, pos3.y, pos3.z, 0);
#ifdef USE_PERIODIC
            real4 center = 0.5f * (maxPos + minPos);
            APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, center)
#endif
            minPos = make_real4(min(minPos.x, pos.x), min(minPos.y, pos.y), min(minPos.z, pos.z), 0);
            maxPos = make_real4(max(maxPos.x, pos.x), max(maxPos.y, pos.y), max(maxPos.z, pos.z), 0);
        }

        real4 blockSize = 0.5f * (maxPos - minPos);
        blockBoundingBox[index] = blockSize;
        blockCenter[index] = 0.5f * (maxPos + minPos);

        // All threads can process GLOBAL_SIZE blocks together; if there are
        // more blocks, advance to the next block for this thread.

        index += GLOBAL_SIZE;
        base = index * WARP_SIZE;
    }

    // Reset numNeighborPairs for the findNeighbors kernel.

    if (GLOBAL_ID == 0) {
        *numNeighborPairs = 0;
    }
}

/**
 * Find a list of neighbors for each atom.
 */
KERNEL void findNeighbors(real4 periodicBoxSize, real4 invPeriodicBoxSize,
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
        GLOBAL const real* RESTRICT condensedPos, GLOBAL const real4* RESTRICT parameters,
        GLOBAL const real4* RESTRICT blockCenter, GLOBAL const real4* RESTRICT blockBoundingBox,
        GLOBAL int* RESTRICT numNeighborPairs, GLOBAL int* RESTRICT numNeighborsForAtom,
        GLOBAL int2* RESTRICT neighborPairs, real cutoffSquared, int maxNeighborPairs) {

    // parameters: each real4 holds [radius, p2, p3, p4].
    // posrCache:  each real4 holds [x, y, z, radius].

    LOCAL real4 posrCache[FIND_NEIGHBORS_THREAD_BLOCK_SIZE];
    int indexInWarp = LOCAL_ID % WARP_SIZE;
    int warpStart = LOCAL_ID - indexInWarp;
#if !(defined(__CUDA_ARCH__) || defined(USE_HIP))
    LOCAL bool includeBlockFlags[FIND_NEIGHBORS_THREAD_BLOCK_SIZE];
#endif

    // Each thread will search for the neighbors of one atom at a time.  Note
    // that all threads in a warp are processing atoms from the same block.

    for (int active1 = GLOBAL_ID; active1 < PADDED_NUM_ACTIVE; active1 += GLOBAL_SIZE) {

        int numNeighborsForAtom1 = 0;

        real3 pos1 = loadCondensedPos(condensedPos, active1);
        real radius1 = parameters[active1].x;

        int block1 = active1 / WARP_SIZE;
        real4 blockCenter1 = blockCenter[block1];
        real4 blockSize1 = blockBoundingBox[block1];

        // Loop over atom blocks to search for neighbors.  The threads in a warp
        // compare block1 against 32 other blocks in parallel.

        for (int block2Base = block1; block2Base < NUM_BLOCKS; block2Base += WARP_SIZE) {
            int block2 = block2Base + indexInWarp;
            bool includeBlock2 = (block2 < NUM_BLOCKS);
            if (includeBlock2) {
                real4 blockCenter2 = blockCenter[block2];
                real4 blockSize2 = blockBoundingBox[block2];
                real4 blockDelta = blockCenter1 - blockCenter2;
#ifdef USE_PERIODIC
                APPLY_PERIODIC_TO_DELTA(blockDelta)
#endif
                blockDelta.x = max((real) 0, fabs(blockDelta.x) - blockSize1.x - blockSize2.x);
                blockDelta.y = max((real) 0, fabs(blockDelta.y) - blockSize1.y - blockSize2.y);
                blockDelta.z = max((real) 0, fabs(blockDelta.z) - blockSize1.z - blockSize2.z);
                includeBlock2 &= (blockDelta.x * blockDelta.x + blockDelta.y * blockDelta.y + blockDelta.z * blockDelta.z < cutoffSquared);
            }

            // Each thread now loops over any blocks we identified as
            // potentially containing neighbors of atoms in block1, and checks
            // to see if they are actually neighbors of atom1.

#if defined(__CUDA_ARCH__) || defined(USE_HIP)
            int includeBlockFlags = BALLOT(includeBlock2);
            while (includeBlockFlags != 0) {
                int i = __ffs(includeBlockFlags) - 1;
                includeBlockFlags &= includeBlockFlags - 1;
                {
#else
            SYNC_WARPS;
            includeBlockFlags[LOCAL_ID] = includeBlock2;
            SYNC_WARPS;
            for (int i = 0; i < WARP_SIZE; i++) {
                if (includeBlockFlags[warpStart + i]) {
#endif
                    int block2 = block2Base + i;
                    int start = block2 * WARP_SIZE;
                    int active = start + indexInWarp;

                    int included[WARP_SIZE];
                    int numIncluded = 0;

                    // All threads in a warp will compare atoms with atoms in
                    // block2, so load their parameters into shared memory.

                    SYNC_WARPS;
                    real3 pos = loadCondensedPos(condensedPos, active);
                    posrCache[LOCAL_ID] = make_real4(pos.x, pos.y, pos.z, parameters[active].x);
                    SYNC_WARPS;

                    if (active1 < NUM_ACTIVE) {
                        for (int j = 0; j < WARP_SIZE; j++) {
                            int active2 = start + j;
                            real4 posr2 = posrCache[warpStart + j];
                            real3 pos2 = trimTo3(posr2);
                            real radius2 = posr2.w;

                            // Decide whether to include this atom pair in the neighbor list.

                            real4 delta12 = delta(pos2, pos1, periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ);
                            real pairCutoff = radius1 + radius2;
                            if (active2 > active1 && active2 < NUM_ACTIVE && delta12.w < pairCutoff * pairCutoff) {
                                int includedIndex = numIncluded++;
                                included[includedIndex] = active2;
                            }
                        }
                    }

                    // Store any neighbors of this atom that were found.

                    if (numIncluded) {
                        int baseIndex = ATOMIC_ADD(numNeighborPairs, numIncluded);
                        if (baseIndex + numIncluded <= maxNeighborPairs) {
                            for (int j = 0; j < numIncluded; j++) {
                                neighborPairs[baseIndex + j] = make_int2(active1, included[j]);
                            }
                        }
                        numNeighborsForAtom1 += numIncluded;
                    }
                }
            }
        }

        numNeighborsForAtom[active1] = numNeighborsForAtom1;
        SYNC_WARPS;
    }
}

/**
 * Sum the neighbor counts to compute the start position of each atom.  This
 * kernel is executed as a single thread block group.
 */
KERNEL void computeNeighborStartIndices(GLOBAL int* RESTRICT numNeighborPairsPointer,
        GLOBAL int* RESTRICT numNeighborsForAtom, GLOBAL int* RESTRICT neighborStartIndex,
        int maxNeighborPairs) {

    LOCAL unsigned int posBuffer[THREAD_BLOCK_SIZE];

    if (*numNeighborPairsPointer > maxNeighborPairs) {
        return; // There wasn't enough memory for the neighbor list.
    }

    unsigned int globalOffset = 0;
    for (unsigned int startAtom = 0; startAtom < NUM_ACTIVE; startAtom += LOCAL_SIZE) {
        // Load the neighbor counts into local memory.

        unsigned int globalIndex = startAtom + LOCAL_ID;
        posBuffer[LOCAL_ID] = (globalIndex < NUM_ACTIVE ? numNeighborsForAtom[globalIndex] : 0);
        SYNC_THREADS;

        // Perform a parallel prefix sum.

        for (unsigned int step = 1; step < LOCAL_SIZE; step *= 2) {
            unsigned int add = (LOCAL_ID >= step ? posBuffer[LOCAL_ID - step] : 0);
            SYNC_THREADS;
            posBuffer[LOCAL_ID] += add;
            SYNC_THREADS;
        }

        // Write the results back to global memory.

        if (globalIndex < NUM_ACTIVE) {
            neighborStartIndex[globalIndex + 1] = posBuffer[LOCAL_ID] + globalOffset;
            // Clear this so the next kernel can use it as a counter.
            numNeighborsForAtom[globalIndex] = 0;
        }
        globalOffset += posBuffer[LOCAL_SIZE - 1];
        SYNC_THREADS;
    }

    if (LOCAL_ID == 0) {
        neighborStartIndex[0] = 0;
    }
}

/**
 * Assemble the final neighbor list.
 */
KERNEL void copyPairsToNeighborList(real4 periodicBoxSize, real4 invPeriodicBoxSize,
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
        GLOBAL const real* RESTRICT condensedPos, GLOBAL const real4* RESTRICT parameters,
        GLOBAL int* RESTRICT numNeighborPairsPointer, GLOBAL int* RESTRICT numNeighborsForAtom,
        GLOBAL const int* RESTRICT neighborStartIndex, GLOBAL const int2* RESTRICT neighborPairs,
        GLOBAL int2* RESTRICT neighbors, GLOBAL NeighborData* RESTRICT neighborData, int maxNeighborPairs) {

    int numNeighborPairs = *numNeighborPairsPointer;
    if (numNeighborPairs > maxNeighborPairs) {
        return; // There wasn't enough memory for the neighbor list.
    }

    for (unsigned int pairIndex = GLOBAL_ID; pairIndex < numNeighborPairs; pairIndex += GLOBAL_SIZE) {
        int2 pair = neighborPairs[pairIndex];

        int offset = ATOMIC_ADD(numNeighborsForAtom + pair.x, 1);
        int storeIndex = neighborStartIndex[pair.x] + offset;

        // Store the original index so we can get back to (i, j) pairs.

        neighbors[storeIndex] = make_int2(pair.y, pairIndex);

        // Precompute data for this pair to be looked up later.

        real3 iPos = loadCondensedPos(condensedPos, pair.x);
        real3 jPos = loadCondensedPos(condensedPos, pair.y);
        real4 ijDelta = delta(jPos, iPos, periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ);
        real iRadius = parameters[pair.x].x;
        real jRadius = parameters[pair.y].x;
        real iRadiusPi = PI * iRadius;
        real jRadiusPi = PI * jRadius;
        real r = SQRT(ijDelta.w);
        real rRecip = (real) 1 / r;
        real deltaRadiusR = (iRadius * iRadius - jRadius * jRadius) * rRecip;
        real deltaRadiusRSq = deltaRadiusR * rRecip;
        real3 forceDir = trimTo3(ijDelta) * rRecip;
        real3 iForceDir = iRadiusPi * ((real) -1 + deltaRadiusRSq) * forceDir;
        real3 jForceDir = jRadiusPi * ((real) -1 - deltaRadiusRSq) * forceDir;

        neighborData[storeIndex].data12 = make_real4(iForceDir.x, iForceDir.y, iForceDir.z, iRadiusPi * ((real) 2 * iRadius - r - deltaRadiusR));
        neighborData[storeIndex].data21 = make_real4(jForceDir.x, jForceDir.y, jForceDir.z, jRadiusPi * ((real) 2 * jRadius - r + deltaRadiusR));
    }
}

/**
 * Compute the LCPO interaction.
 */
KERNEL void computeInteraction(
        real4 periodicBoxSize, real4 invPeriodicBoxSize,
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
        GLOBAL const real* RESTRICT condensedPos, GLOBAL mm_ulong* RESTRICT forceBuffers,
        GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const int* RESTRICT activeParticles,
        GLOBAL const real4* RESTRICT parameters, GLOBAL int* RESTRICT numNeighborPairsPointer,
        GLOBAL const int* RESTRICT neighborStartIndex, GLOBAL const int2* RESTRICT neighborPairs,
        GLOBAL int2* RESTRICT neighbors, GLOBAL NeighborData* RESTRICT neighborData, int maxNeighborPairs) {

    int numNeighborPairs = *numNeighborPairsPointer;
    if (numNeighborPairs > maxNeighborPairs) {
        return; // There wasn't enough memory for the neighbor list.
    }

    mixed energy = 0;
    for (unsigned int pairIndex = GLOBAL_ID; pairIndex < numNeighborPairs; pairIndex += GLOBAL_SIZE) {
        int2 pair = neighbors[pairIndex];
        int i = neighborPairs[pair.y].x;
        int j = pair.x;
        real3 iPos = loadCondensedPos(condensedPos, i);
        real4 iParams = parameters[i];
        real4 jParams = parameters[j];
        real3 iForce = make_real3(0);
        real3 jForce = make_real3(0);
        NeighborData ijData = neighborData[pairIndex];
        real4 Aij = ijData.data12;
        real4 Aji = ijData.data21;

        real4 ijForce2 = iParams.y * Aij + jParams.y * Aji;
        real3 ijForce2_3 = trimTo3(ijForce2);
        iForce += ijForce2_3;
        jForce -= ijForce2_3;
        energy += ijForce2.w;

        real iRadius = iParams.x;
        real iRadiusPi = PI * iRadius;

        int jStart = neighborStartIndex[j];
        int jEnd = neighborStartIndex[j + 1];
        for (int jNeighbor = jStart; jNeighbor < jEnd; jNeighbor++) {
            int k = neighbors[jNeighbor].x;

            // It is faster to recompute the i-k interaction distance and
            // parameters to decide whether or not to include k than to try to
            // look up its index in the neighbor list of i.

            real3 kPos = loadCondensedPos(condensedPos, k);
            real4 kParams = parameters[k];
            real kRadius = kParams.x;
            real kRadiusPi = PI * kRadius;

            real4 ikDelta = delta(kPos, iPos, periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ);
            real pairCutoff = iRadius + kRadius;
            if (ikDelta.w >= pairCutoff * pairCutoff) {
                continue;
            }

            real r = SQRT(ikDelta.w);
            real rRecip = (real) 1 / r;
            real deltaRadiusR = (iRadius * iRadius - kRadius * kRadius) * rRecip;
            real deltaRadiusRSq = deltaRadiusR * rRecip;
            real3 forceDir = trimTo3(ikDelta) * rRecip;
            real3 iForceDir = iRadiusPi * ((real) -1 + deltaRadiusRSq) * forceDir;
            real3 jForceDir = kRadiusPi * ((real) -1 - deltaRadiusRSq) * forceDir;

            real4 Aik = make_real4(iForceDir.x, iForceDir.y, iForceDir.z, iRadiusPi * ((real) 2 * iRadius - r - deltaRadiusR));
            real4 Aki = make_real4(jForceDir.x, jForceDir.y, jForceDir.z, kRadiusPi * ((real) 2 * kRadius - r + deltaRadiusR));

            NeighborData jkData = neighborData[jNeighbor];
            real4 Ajk = jkData.data12;
            real4 Akj = jkData.data21;

            real4 ijkForce34 = (iParams.z + iParams.w * Aij.w) * Ajk + (iParams.z + iParams.w * Aik.w) * Akj;
            real4 jikForce34 = (jParams.z + jParams.w * Aji.w) * Aik + (jParams.z + jParams.w * Ajk.w) * Aki;
            real4 kijForce34 = (kParams.z + kParams.w * Aki.w) * Aij + (kParams.z + kParams.w * Akj.w) * Aji;
            real3 ijkForce34_3 = trimTo3(ijkForce34);
            real3 jikForce34_3 = trimTo3(jikForce34);
            real3 kijForce34_3 = trimTo3(kijForce34);

            real3 ijkForce4 = iParams.w * trimTo3(Aij) * Ajk.w;
            real3 ikjForce4 = iParams.w * trimTo3(Aik) * Akj.w;
            real3 jikForce4 = jParams.w * trimTo3(Aji) * Aik.w;
            real3 jkiForce4 = jParams.w * trimTo3(Ajk) * Aki.w;
            real3 kijForce4 = kParams.w * trimTo3(Aki) * Aij.w;
            real3 kjiForce4 = kParams.w * trimTo3(Akj) * Aji.w;

            energy += ijkForce34.w + jikForce34.w + kijForce34.w;
            iForce += jikForce34_3 + kijForce34_3 + ijkForce4 + ikjForce4 + jikForce4 + kijForce4;
            jForce += ijkForce34_3 - kijForce34_3 + jkiForce4 - jikForce4 - ijkForce4 + kjiForce4;
            real3 kForce = -(ijkForce34_3 + jikForce34_3 + kijForce4 + kjiForce4 + ikjForce4 + jkiForce4);

            storeForce(activeParticles[k], kForce, forceBuffers);
        }

        storeForce(activeParticles[i], iForce, forceBuffers);
        storeForce(activeParticles[j], jForce, forceBuffers);
    }
    energyBuffer[GLOBAL_ID] += energy;
}