customGBEnergyN2.cc 16 KB
Newer Older
1
#ifdef SUPPORTS_64_BIT_ATOMICS
2
3
#define STORE_DERIVATIVE_1(INDEX) ATOMIC_ADD(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (deriv##INDEX##_1*0x100000000)));
#define STORE_DERIVATIVE_2(INDEX) ATOMIC_ADD(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (local_deriv##INDEX[LOCAL_ID]*0x100000000)));
4
5
#else
#define STORE_DERIVATIVE_1(INDEX) derivBuffers##INDEX[offset] += deriv##INDEX##_1;
6
#define STORE_DERIVATIVE_2(INDEX) derivBuffers##INDEX[offset] += local_deriv##INDEX[LOCAL_ID];
7
8
9
10
11
#endif

/**
 * Compute a force based on pair interactions.
 */
12
KERNEL void computeN2Energy(
13
#ifdef SUPPORTS_64_BIT_ATOMICS
14
        GLOBAL mm_ulong* RESTRICT forceBuffers,
15
#else
16
        GLOBAL real4* RESTRICT forceBuffers,
17
#endif
18
19
        GLOBAL mixed* RESTRICT energyBuffer,
        GLOBAL const real4* RESTRICT posq, GLOBAL const unsigned int* RESTRICT exclusions,
20
        GLOBAL const int2* exclusionTiles, int needEnergy,
21
#ifdef USE_CUTOFF
22
23
24
        GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, GLOBAL const real4* RESTRICT blockCenter,
        GLOBAL const real4* RESTRICT blockSize, GLOBAL const int* RESTRICT interactingAtoms
25
26
27
28
#else
        unsigned int numTiles
#endif
        PARAMETER_ARGUMENTS) {
29
30
31
32
    const unsigned int totalWarps = GLOBAL_SIZE/TILE_SIZE;
    const unsigned int warp = GLOBAL_ID/TILE_SIZE;
    const unsigned int tgx = LOCAL_ID & (TILE_SIZE-1);
    const unsigned int tbx = LOCAL_ID - tgx;
33
    mixed energy = 0;
34
    INIT_PARAM_DERIVS
35
36
37
    LOCAL real3 local_pos[LOCAL_BUFFER_SIZE];
    LOCAL real3 local_force[LOCAL_BUFFER_SIZE];
    ATOM_PARAMETER_DATA
38
39
40

    // First loop: process tiles that contain exclusions.
    
41
42
    const int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
    const int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
43
    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
44
        const int2 tileIndices = exclusionTiles[pos];
45
46
        const unsigned int x = tileIndices.x;
        const unsigned int y = tileIndices.y;
47
        real3 force = make_real3(0);
48
49
        DECLARE_ATOM1_DERIVATIVES
        unsigned int atom1 = x*TILE_SIZE + tgx;
50
        real3 pos1 = trimTo3(posq[atom1]);
51
52
53
54
55
56
57
        LOAD_ATOM1_PARAMETERS
#ifdef USE_EXCLUSIONS
        unsigned int excl = exclusions[pos*TILE_SIZE+tgx];
#endif
        if (x == y) {
            // This tile is on the diagonal.

58
59
            const unsigned int localAtomIndex = LOCAL_ID;
            local_pos[localAtomIndex] = pos1;
60
61
62
63
            LOAD_LOCAL_PARAMETERS_FROM_1
            SYNC_WARPS;
            for (unsigned int j = 0; j < TILE_SIZE; j++) {
                int atom2 = tbx+j;
64
65
                real3 pos2 = local_pos[atom2];
                real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
66
#ifdef USE_PERIODIC
67
                APPLY_PERIODIC_TO_DELTA(delta)
68
69
70
71
72
73
#endif
                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
                if (r2 < CUTOFF_SQUARED) {
#endif
                    real invR = RSQRT(r2);
peastman's avatar
peastman committed
74
                    real r = r2*invR;
75
76
77
78
                    LOAD_ATOM2_PARAMETERS
                    atom2 = y*TILE_SIZE+j;
                    real dEdR = 0;
                    real tempEnergy = 0;
79
                    const real interactionScale = 0.5f;
80
81
82
83
84
85
86
#ifdef USE_EXCLUSIONS
                    bool isExcluded = !(excl & 0x1);
#endif
                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
                        COMPUTE_INTERACTION
                        dEdR /= -r;
                    }
87
88
                    if (needEnergy)
                        energy += 0.5f*tempEnergy;
89
90
91
92
                    delta *= dEdR;
                    force.x -= delta.x;
                    force.y -= delta.y;
                    force.z -= delta.z;
93
94
95
96
97
98
99
100
101
102
103
104
#ifdef USE_CUTOFF
                }
#endif
#ifdef USE_EXCLUSIONS
                excl >>= 1;
#endif
                SYNC_WARPS;
            }
        }
        else {
            // This is an off-diagonal tile.

105
            const unsigned int localAtomIndex = LOCAL_ID;
106
            unsigned int j = y*TILE_SIZE + tgx;
107
            local_pos[localAtomIndex] = trimTo3(posq[j]);
108
            LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
109
            local_force[localAtomIndex] = make_real3(0);
110
111
112
113
114
115
116
117
            CLEAR_LOCAL_DERIVATIVES
            SYNC_WARPS;
#ifdef USE_EXCLUSIONS
            excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
#endif
            unsigned int tj = tgx;
            for (j = 0; j < TILE_SIZE; j++) {
                int atom2 = tbx+tj;
118
119
                real3 pos2 = local_pos[atom2];
                real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
120
#ifdef USE_PERIODIC
121
                APPLY_PERIODIC_TO_DELTA(delta)
122
123
124
125
126
127
#endif
                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
                if (r2 < CUTOFF_SQUARED) {
#endif
                    real invR = RSQRT(r2);
peastman's avatar
peastman committed
128
                    real r = r2*invR;
129
130
131
132
                    LOAD_ATOM2_PARAMETERS
                    atom2 = y*TILE_SIZE+tj;
                    real dEdR = 0;
                    real tempEnergy = 0;
133
                    const real interactionScale = 1;
134
135
136
137
138
139
140
#ifdef USE_EXCLUSIONS
                    bool isExcluded = !(excl & 0x1);
#endif
                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
                        COMPUTE_INTERACTION
                        dEdR /= -r;
                    }
141
142
                    if (needEnergy)
                        energy += tempEnergy;
143
144
145
146
                    delta *= dEdR;
                    force.x -= delta.x;
                    force.y -= delta.y;
                    force.z -= delta.z;
147
                    atom2 = tbx+tj;
148
                    local_force[atom2] += delta;
149
150
151
152
153
154
155
156
157
158
159
                    RECORD_DERIVATIVE_2
#ifdef USE_CUTOFF
                }
#endif
#ifdef USE_EXCLUSIONS
                excl >>= 1;
#endif
                tj = (tj + 1) & (TILE_SIZE - 1);
                SYNC_WARPS;
            }
        }
160

161
        // Write results.
162

163
164
#ifdef SUPPORTS_64_BIT_ATOMICS
        unsigned int offset = x*TILE_SIZE + tgx;
165
166
167
        ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) ((mm_long) (force.x*0x100000000)));
        ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.y*0x100000000)));
        ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.z*0x100000000)));
168
169
170
        STORE_DERIVATIVES_1
        if (x != y) {
            offset = y*TILE_SIZE + tgx;
171
172
173
            ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) ((mm_long) (local_force[LOCAL_ID].x*0x100000000)));
            ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (local_force[LOCAL_ID].y*0x100000000)));
            ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (local_force[LOCAL_ID].z*0x100000000)));
174
175
176
177
178
179
180
181
182
183
            STORE_DERIVATIVES_2
        }
#else
        unsigned int offset1 = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
        unsigned int offset2 = y*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
        unsigned int offset = offset1;
        forceBuffers[offset1].xyz += force.xyz;
        STORE_DERIVATIVES_1
        if (x != y) {
            offset = offset2;
184
            forceBuffers[offset2] += (real4) (local_force[LOCAL_ID].x, local_force[LOCAL_ID].y, local_force[LOCAL_ID].z, 0.0f);
185
186
187
188
189
190
191
192
193
194
            STORE_DERIVATIVES_2
        }
#endif
    }

    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
    // of them (no cutoff).

#ifdef USE_CUTOFF
    unsigned int numTiles = interactionCount[0];
195
196
    if (numTiles > maxTiles)
        return; // There wasn't enough memory for the neighbor list.
197
198
    int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : (mm_long)numTiles)/totalWarps);
    int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : (mm_long)numTiles)/totalWarps);
199
#else
200
201
    int pos = (int) (warp*(mm_long)numTiles/totalWarps);
    int end = (int) ((warp+1)*(mm_long)numTiles/totalWarps);
202
203
204
#endif
    int skipBase = 0;
    int currentSkipIndex = tbx;
205
206
207
    LOCAL int atomIndices[LOCAL_BUFFER_SIZE];
    LOCAL volatile int skipTiles[LOCAL_BUFFER_SIZE];
    skipTiles[LOCAL_ID] = -1;
208
209
210

    while (pos < end) {
        const bool isExcluded = false;
211
        real3 force = make_real3(0);
212
213
214
215
216
        DECLARE_ATOM1_DERIVATIVES
        bool includeTile = true;

        // Extract the coordinates of this tile.
        
217
        int x, y;
218
219
        bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF
220
221
222
223
224
225
226
227
228
229
        x = tiles[pos];
        real4 blockSizeX = blockSize[x];
        singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= CUTOFF &&
                              0.5f*periodicBoxSize.y-blockSizeX.y >= CUTOFF &&
                              0.5f*periodicBoxSize.z-blockSizeX.z >= CUTOFF);
#else
        y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
        x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
        if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
            y += (x < y ? -1 : 1);
230
            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
231
        }
232

233
        // Skip over tiles that have exclusions, since they were already processed.
234

235
236
        SYNC_WARPS;
        while (skipTiles[tbx+TILE_SIZE-1] < pos) {
237
            SYNC_WARPS;
238
            if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
239
                int2 tile = exclusionTiles[skipBase+tgx];
240
                skipTiles[LOCAL_ID] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
241
            }
242
            else
243
                skipTiles[LOCAL_ID] = end;
244
245
246
            skipBase += TILE_SIZE;            
            currentSkipIndex = tbx;
            SYNC_WARPS;
247
        }
248
249
250
251
        while (skipTiles[currentSkipIndex] < pos)
            currentSkipIndex++;
        includeTile = (skipTiles[currentSkipIndex] != pos);
#endif
252
253
254
255
        if (includeTile) {
            unsigned int atom1 = x*TILE_SIZE + tgx;

            // Load atom data for this tile.
256
257

            real3 pos1 = trimTo3(posq[atom1]);
258
            LOAD_ATOM1_PARAMETERS
259
            const unsigned int localAtomIndex = LOCAL_ID;
260
#ifdef USE_CUTOFF
peastman's avatar
peastman committed
261
            unsigned int j = interactingAtoms[pos*TILE_SIZE+tgx];
262
263
264
#else
            unsigned int j = y*TILE_SIZE + tgx;
#endif
265
            atomIndices[LOCAL_ID] = j;
266
            if (j < PADDED_NUM_ATOMS) {
267
                local_pos[localAtomIndex] = trimTo3(posq[j]);
268
                LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
269
                local_force[localAtomIndex] = make_real3(0);
270
271
272
273
274
275
276
277
278
                CLEAR_LOCAL_DERIVATIVES
            }
            SYNC_WARPS;
#ifdef USE_PERIODIC
            if (singlePeriodicCopy) {
                // The box is small enough that we can just translate all the atoms into a single periodic
                // box, then skip having to apply periodic boundary conditions later.

                real4 blockCenterX = blockCenter[x];
279
280
                APPLY_PERIODIC_TO_POS_WITH_CENTER(pos1, blockCenterX)
                APPLY_PERIODIC_TO_POS_WITH_CENTER(local_pos[LOCAL_ID], blockCenterX)
281
282
283
284
                SYNC_WARPS;
                unsigned int tj = tgx;
                for (j = 0; j < TILE_SIZE; j++) {
                    int atom2 = tbx+tj;
285
286
                    real3 pos2 = local_pos[atom2];
                    real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
287
288
289
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                    if (r2 < CUTOFF_SQUARED) {
                        real invR = RSQRT(r2);
peastman's avatar
peastman committed
290
                        real r = r2*invR;
291
292
293
294
                        LOAD_ATOM2_PARAMETERS
                        atom2 = atomIndices[tbx+tj];
                        real dEdR = 0;
                        real tempEnergy = 0;
295
                        const real interactionScale = 1;
296
297
298
299
                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
                            COMPUTE_INTERACTION
                            dEdR /= -r;
                        }
300
301
                        if (needEnergy)
                            energy += tempEnergy;
302
303
304
305
                        delta *= dEdR;
                        force.x -= delta.x;
                        force.y -= delta.y;
                        force.z -= delta.z;
306
                        atom2 = tbx+tj;
307
                        local_force[atom2] += delta;
308
309
310
311
312
313
314
315
316
317
318
319
320
321
                        RECORD_DERIVATIVE_2
                    }
                    tj = (tj + 1) & (TILE_SIZE - 1);
                    SYNC_WARPS;
                }
            }
            else
#endif
            {
                // We need to apply periodic boundary conditions separately for each interaction.

                unsigned int tj = tgx;
                for (j = 0; j < TILE_SIZE; j++) {
                    int atom2 = tbx+tj;
322
323
                    real3 pos2 = local_pos[atom2];
                    real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
324
#ifdef USE_PERIODIC
325
                    APPLY_PERIODIC_TO_DELTA(delta)
326
327
328
329
330
331
#endif
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
                    if (r2 < CUTOFF_SQUARED) {
#endif
                        real invR = RSQRT(r2);
peastman's avatar
peastman committed
332
                        real r = r2*invR;
333
334
335
336
                        LOAD_ATOM2_PARAMETERS
                        atom2 = atomIndices[tbx+tj];
                        real dEdR = 0;
                        real tempEnergy = 0;
337
                        const real interactionScale = 1;
338
339
340
341
                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
                            COMPUTE_INTERACTION
                            dEdR /= -r;
                        }
342
343
                        if (needEnergy)
                            energy += tempEnergy;
344
345
346
347
                        delta *= dEdR;
                        force.x -= delta.x;
                        force.y -= delta.y;
                        force.z -= delta.z;
348
                        atom2 = tbx+tj;
349
                        local_force[atom2] += delta;
350
351
352
353
354
355
356
357
358
359
                        RECORD_DERIVATIVE_2
#ifdef USE_CUTOFF
                    }
#endif
                    tj = (tj + 1) & (TILE_SIZE - 1);
                    SYNC_WARPS;
                }
            }
        
            // Write results.
360

361
#ifdef USE_CUTOFF
362
            unsigned int atom2 = atomIndices[LOCAL_ID];
363
364
365
366
#else
            unsigned int atom2 = y*TILE_SIZE + tgx;
#endif
#ifdef SUPPORTS_64_BIT_ATOMICS
367
368
369
            ATOMIC_ADD(&forceBuffers[atom1], (mm_ulong) ((mm_long) (force.x*0x100000000)));
            ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long)  (force.y*0x100000000)));
            ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.z*0x100000000)));
370
371
372
            unsigned int offset = atom1;
            STORE_DERIVATIVES_1
            if (atom2 < PADDED_NUM_ATOMS) {
373
374
375
                ATOMIC_ADD(&forceBuffers[atom2], (mm_ulong) ((mm_long) (local_force[LOCAL_ID].x*0x100000000)));
                ATOMIC_ADD(&forceBuffers[atom2+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (local_force[LOCAL_ID].y*0x100000000)));
                ATOMIC_ADD(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (local_force[LOCAL_ID].z*0x100000000)));
376
377
378
379
380
381
382
383
384
385
                offset = atom2;
                STORE_DERIVATIVES_2
            }
#else
            unsigned int offset1 = atom1 + warp*PADDED_NUM_ATOMS;
            unsigned int offset2 = atom2 + warp*PADDED_NUM_ATOMS;
            forceBuffers[offset1].xyz += force.xyz;
            unsigned int offset = offset1;
            STORE_DERIVATIVES_1
            if (atom2 < PADDED_NUM_ATOMS) {
386
                forceBuffers[offset2] += (real4) (local_force[LOCAL_ID].x, local_force[LOCAL_ID].y, local_force[LOCAL_ID].z, 0.0f);
387
388
389
390
391
392
393
                offset = offset2;
                STORE_DERIVATIVES_2
            }
#endif
        }
        pos++;
    }
394
    energyBuffer[GLOBAL_ID] += energy;
395
    SAVE_PARAM_DERIVS
396
}