customGBEnergyN2_cpu.cc 17.7 KB
Newer Older
1
#ifdef SUPPORTS_64_BIT_ATOMICS
2
3
#define STORE_DERIVATIVE_1(INDEX) ATOMIC_ADD(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (deriv##INDEX##_1*0x100000000)));
#define STORE_DERIVATIVE_2(INDEX) ATOMIC_ADD(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (local_deriv##INDEX[tgx]*0x100000000)));
4
5
6
7
#else
#define STORE_DERIVATIVE_1(INDEX) derivBuffers##INDEX[offset] += deriv##INDEX##_1;
#define STORE_DERIVATIVE_2(INDEX) derivBuffers##INDEX[offset] += local_deriv##INDEX[tgx];
#endif
8
9
10
11

/**
 * Compute a force based on pair interactions.
 */
12
KERNEL void computeN2Energy(
13
#ifdef SUPPORTS_64_BIT_ATOMICS
14
        GLOBAL mm_ulong* RESTRICT forceBuffers,
15
#else
16
        GLOBAL real4* RESTRICT forceBuffers,
17
#endif
18
19
20
        GLOBAL mixed* RESTRICT energyBuffer,
        GLOBAL const real4* RESTRICT posq, GLOBAL const unsigned int* RESTRICT exclusions,
        GLOBAL const ushort2* exclusionTiles, int needEnergy,
21
#ifdef USE_CUTOFF
22
23
24
        GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, GLOBAL const real4* RESTRICT blockCenter,
        GLOBAL const real4* RESTRICT blockSize, GLOBAL const int* RESTRICT interactingAtoms
25
#else
26
        unsigned int numTiles
27
#endif
28
        PARAMETER_ARGUMENTS) {
29
    mixed energy = 0;
30
    INIT_PARAM_DERIVS
31
32
33
    LOCAL real3 local_pos[LOCAL_BUFFER_SIZE];
    LOCAL real3 local_force[LOCAL_BUFFER_SIZE];
    ATOM_PARAMETER_DATA
34

35
36
    // First loop: process tiles that contain exclusions.
    
37
38
    const int firstExclusionTile = FIRST_EXCLUSION_TILE+GROUP_ID*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/NUM_GROUPS;
    const int lastExclusionTile = FIRST_EXCLUSION_TILE+(GROUP_ID+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/NUM_GROUPS;
39
40
41
42
    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
        const ushort2 tileIndices = exclusionTiles[pos];
        const unsigned int x = tileIndices.x;
        const unsigned int y = tileIndices.y;
43

44
        // Load the data for this tile.
45

46
47
        for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
            unsigned int j = y*TILE_SIZE + localAtomIndex;
48
            local_pos[localAtomIndex] = trimTo3(posq[j]);
49
            LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
50
51
52
53
54
55
        }
        if (x == y) {
            // This tile is on the diagonal.

            for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
#ifdef USE_EXCLUSIONS
56
                unsigned int excl = exclusions[pos*TILE_SIZE+tgx];
57
58
#endif
                unsigned int atom1 = x*TILE_SIZE+tgx;
59
                real4 force = 0;
60
                DECLARE_ATOM1_DERIVATIVES
61
                real3 pos1 = trimTo3(posq[atom1]);
62
63
                LOAD_ATOM1_PARAMETERS
                for (unsigned int j = 0; j < TILE_SIZE; j++) {
64
65
                    real3 pos2 = local_pos[j];
                    real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
66
#ifdef USE_PERIODIC
67
                    APPLY_PERIODIC_TO_DELTA(delta)
68
#endif
69
                    real r2 = dot(delta, delta);
70
71
72
#ifdef USE_CUTOFF
                    if (r2 < CUTOFF_SQUARED) {
#endif
73
                        real invR = RSQRT(r2);
peastman's avatar
peastman committed
74
                        real r = r2*invR;
75
76
77
78
79
                        unsigned int atom2 = j;
                        LOAD_ATOM2_PARAMETERS
                        atom2 = y*TILE_SIZE+j;
                        real dEdR = 0;
                        real tempEnergy = 0;
80
                        const real interactionScale = 0.5f;
81
82
83
84
85
86
87
88
#ifdef USE_EXCLUSIONS
                        bool isExcluded = !(excl & 0x1);
#endif
                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
                            COMPUTE_INTERACTION
                            dEdR /= -r;
                        }
                        energy += 0.5f*tempEnergy;
89
90
91
92
                        delta *= dEdR;
                        force.x -= delta.x;
                        force.y -= delta.y;
                        force.z -= delta.z;
93
94
95
96
97
98
99
100
#ifdef USE_CUTOFF
                    }
#endif
#ifdef USE_EXCLUSIONS
                    excl >>= 1;
#endif
                }

101
                // Write results.
102

103
104
#ifdef SUPPORTS_64_BIT_ATOMICS
                unsigned int offset = atom1;
105
106
107
                ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) ((mm_long) (force.x*0x100000000)));
                ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.y*0x100000000)));
                ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.z*0x100000000)));
108
                STORE_DERIVATIVES_1
109
#else
110
                unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
111
112
113
                forceBuffers[offset].xyz += force.xyz;
                STORE_DERIVATIVES_1
#endif
114
115
116
117
118
119
            }
        }
        else {
            // This is an off-diagonal tile.

            for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
120
                local_force[localAtomIndex] = 0;
121
122
                CLEAR_LOCAL_DERIVATIVES
            }
123
124
125
126
127
128
129
            for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
#ifdef USE_EXCLUSIONS
                unsigned int excl = exclusions[pos*TILE_SIZE+tgx];
#endif
                unsigned int atom1 = x*TILE_SIZE+tgx;
                real4 force = 0;
                DECLARE_ATOM1_DERIVATIVES
130
                real3 pos1 = trimTo3(posq[atom1]);
131
132
                LOAD_ATOM1_PARAMETERS
                for (unsigned int j = 0; j < TILE_SIZE; j++) {
133
134
                    real3 pos2 = local_pos[j];
                    real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
135
#ifdef USE_PERIODIC
136
                    APPLY_PERIODIC_TO_DELTA(delta)
137
#endif
138
139
140
141
142
                    real r2 = dot(delta.xyz, delta.xyz);
#ifdef USE_CUTOFF
                    if (r2 < CUTOFF_SQUARED) {
#endif
                        real invR = RSQRT(r2);
peastman's avatar
peastman committed
143
                        real r = r2*invR;
144
145
146
147
148
                        unsigned int atom2 = j;
                        LOAD_ATOM2_PARAMETERS
                        atom2 = y*TILE_SIZE+j;
                        real dEdR = 0;
                        real tempEnergy = 0;
149
                        const real interactionScale = 1.0f;
150
151
152
153
154
155
156
157
#ifdef USE_EXCLUSIONS
                        bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1));
                        if (!isExcluded) {
#else
                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
#endif
                            COMPUTE_INTERACTION
                            dEdR /= -r;
158
                        }
159
                        energy += tempEnergy;
160
161
162
163
                        delta *= dEdR;
                        force.x -= delta.x;
                        force.y -= delta.y;
                        force.z -= delta.z;
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
                        atom2 = j;
                        local_force[atom2].xyz += delta.xyz;
                        RECORD_DERIVATIVE_2
#ifdef USE_CUTOFF
                    }
#endif
#ifdef USE_EXCLUSIONS
                    excl >>= 1;
#endif
                }

                // Write results for atom1.

#ifdef SUPPORTS_64_BIT_ATOMICS
                unsigned int offset = atom1;
179
180
181
                ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) ((mm_long) (force.x*0x100000000)));
                ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.y*0x100000000)));
                ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.z*0x100000000)));
182
183
                STORE_DERIVATIVES_1
#else
184
                unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
185
186
187
188
189
190
191
192
193
194
                forceBuffers[offset].xyz += force.xyz;
                STORE_DERIVATIVES_1
#endif
            }

            // Write results.

            for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
#ifdef SUPPORTS_64_BIT_ATOMICS
                unsigned int offset = y*TILE_SIZE+tgx;
195
196
197
                ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) ((mm_long) (local_force[tgx].x*0x100000000)));
                ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (local_force[tgx].y*0x100000000)));
                ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (local_force[tgx].z*0x100000000)));
198
199
                STORE_DERIVATIVES_2
#else
200
                unsigned int offset = y*TILE_SIZE+tgx + GROUP_ID*PADDED_NUM_ATOMS;
201
202
203
204
205
206
                forceBuffers[offset].xyz += local_force[tgx].xyz;
                STORE_DERIVATIVES_2
#endif
            }
        }
    }
207

208
209
    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
    // of them (no cutoff).
210

211
212
#ifdef USE_CUTOFF
    const unsigned int numTiles = interactionCount[0];
213
214
    if (numTiles > maxTiles)
        return; // There wasn't enough memory for the neighbor list.
215
216
    int pos = (int) (GROUP_ID*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : numTiles)/NUM_GROUPS);
    int end = (int) ((GROUP_ID+1)*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : numTiles)/NUM_GROUPS);
217
#else
218
219
    int pos = (int) (GROUP_ID*(mm_long)numTiles/NUM_GROUPS);
    int end = (int) ((GROUP_ID+1)*(mm_long)numTiles/NUM_GROUPS);
220
221
222
#endif
    int nextToSkip = -1;
    int currentSkipIndex = 0;
223
    LOCAL int atomIndices[TILE_SIZE];
224
225
226
227
228
229
230

    while (pos < end) {
        const bool isExcluded = false;
        bool includeTile = true;
        
        // Extract the coordinates of this tile.
        
231
        int x, y;
232
233
        bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF
234
235
236
237
238
239
240
241
242
243
        x = tiles[pos];
        real4 blockSizeX = blockSize[x];
        singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= CUTOFF &&
                              0.5f*periodicBoxSize.y-blockSizeX.y >= CUTOFF &&
                              0.5f*periodicBoxSize.z-blockSizeX.z >= CUTOFF);
#else
        y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
        x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
        if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
            y += (x < y ? -1 : 1);
244
            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
245
        }
246

247
        // Skip over tiles that have exclusions, since they were already processed.
248

249
250
251
252
        while (nextToSkip < pos) {
            if (currentSkipIndex < NUM_TILES_WITH_EXCLUSIONS) {
                ushort2 tile = exclusionTiles[currentSkipIndex++];
                nextToSkip = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
253
            }
254
255
            else
                nextToSkip = end;
256
        }
257
258
        includeTile = (nextToSkip != pos);
#endif
259
260
261
262
263
        if (includeTile) {
            // Load the data for this tile.

            for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
#ifdef USE_CUTOFF
peastman's avatar
peastman committed
264
                unsigned int j = interactingAtoms[pos*TILE_SIZE+localAtomIndex];
265
266
267
268
269
#else
                unsigned int j = y*TILE_SIZE+localAtomIndex;
#endif
                atomIndices[localAtomIndex] = j;
                if (j < PADDED_NUM_ATOMS) {
270
                    local_pos[localAtomIndex] = trimTo3(posq[j]);
271
272
273
274
275
276
277
278
279
280
281
282
                    LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
                    local_force[localAtomIndex] = 0;
                    CLEAR_LOCAL_DERIVATIVES
                }
            }
#ifdef USE_PERIODIC
            if (singlePeriodicCopy) {
                // The box is small enough that we can just translate all the atoms into a single periodic
                // box, then skip having to apply periodic boundary conditions later.

                real4 blockCenterX = blockCenter[x];
                for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++)
283
                    APPLY_PERIODIC_TO_POS_WITH_CENTER(local_pos[tgx], blockCenterX)
284
285
286
287
                for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
                    unsigned int atom1 = x*TILE_SIZE+tgx;
                    real4 force = 0;
                    DECLARE_ATOM1_DERIVATIVES
288
289
                    real3 pos1 = trimTo3(posq[atom1]);
                    APPLY_PERIODIC_TO_POS_WITH_CENTER(pos1, blockCenterX)
290
291
                    LOAD_ATOM1_PARAMETERS
                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
292
293
                        real3 pos2 = local_pos[j];
                        real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
294
295
296
                        real r2 = dot(delta.xyz, delta.xyz);
                        if (atom1 < NUM_ATOMS && atomIndices[j] < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
                            real invR = RSQRT(r2);
peastman's avatar
peastman committed
297
                            real r = r2*invR;
298
299
300
301
302
                            unsigned int atom2 = j;
                            LOAD_ATOM2_PARAMETERS
                            atom2 = atomIndices[j];
                            real dEdR = 0;
                            real tempEnergy = 0;
303
                            const real interactionScale = 1.0f;
304
305
306
                            COMPUTE_INTERACTION
                            dEdR /= -r;
                            energy += tempEnergy;
307
308
309
310
                            delta *= dEdR;
                            force.x -= delta.x;
                            force.y -= delta.y;
                            force.z -= delta.z;
311
312
313
314
                            atom2 = j;
                            local_force[atom2].xyz += delta.xyz;
                            RECORD_DERIVATIVE_2
                        }
315
                    }
316
317
318
319
320

                    // Write results for atom1.

#ifdef SUPPORTS_64_BIT_ATOMICS
                    unsigned int offset = atom1;
321
322
323
                    ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) ((mm_long) (force.x*0x100000000)));
                    ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.y*0x100000000)));
                    ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.z*0x100000000)));
324
325
                    STORE_DERIVATIVES_1
#else
326
                    unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
327
328
329
                    forceBuffers[offset].xyz += force.xyz;
                    STORE_DERIVATIVES_1
#endif
330
331
332
333
334
                }
            }
            else
#endif
            {
335
                // We need to apply periodic boundary conditions separately for each interaction.
336
337
338

                for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
                    unsigned int atom1 = x*TILE_SIZE+tgx;
339
                    real4 force = 0;
340
                    DECLARE_ATOM1_DERIVATIVES
341
                    real3 pos1 = trimTo3(posq[atom1]);
342
343
                    LOAD_ATOM1_PARAMETERS
                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
344
345
                        real3 pos2 = local_pos[j];
                        real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
346
#ifdef USE_PERIODIC
347
                        APPLY_PERIODIC_TO_DELTA(delta)
348
#endif
349
                        real r2 = dot(delta.xyz, delta.xyz);
350
#ifdef USE_CUTOFF
351
352
353
                        if (atom1 < NUM_ATOMS && atomIndices[j] < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
                        if (atom1 < NUM_ATOMS && atomIndices[j] < NUM_ATOMS) {
354
#endif
355
                            real invR = RSQRT(r2);
peastman's avatar
peastman committed
356
                            real r = r2*invR;
357
358
359
360
361
                            unsigned int atom2 = j;
                            LOAD_ATOM2_PARAMETERS
                            atom2 = atomIndices[j];
                            real dEdR = 0;
                            real tempEnergy = 0;
362
                            const real interactionScale = 1.0f;
363
364
                            COMPUTE_INTERACTION
                            dEdR /= -r;
365
                            energy += tempEnergy;
366
367
368
369
                            delta *= dEdR;
                            force.x -= delta.x;
                            force.y -= delta.y;
                            force.z -= delta.z;
370
                            atom2 = j;
371
                            local_force[atom2] += delta;
372
                            RECORD_DERIVATIVE_2
373
374
375
376
377
                        }
                    }

                    // Write results for atom1.

378
379
#ifdef SUPPORTS_64_BIT_ATOMICS
                    unsigned int offset = atom1;
380
381
382
                    ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) ((mm_long) (force.x*0x100000000)));
                    ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.y*0x100000000)));
                    ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.z*0x100000000)));
383
384
                    STORE_DERIVATIVES_1
#else
385
                    unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
386
                    forceBuffers[offset].xyz += force.xyz;
387
                    STORE_DERIVATIVES_1
388
#endif
389
390
391
                }
            }

392
            // Write results.
393
394

            for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
395
396
397
398
399
400
401
#ifdef USE_CUTOFF
                unsigned int atom2 = atomIndices[tgx];
#else
                unsigned int atom2 = y*TILE_SIZE + tgx;
#endif
                if (atom2 < PADDED_NUM_ATOMS) {
#ifdef SUPPORTS_64_BIT_ATOMICS
402
403
404
                    ATOMIC_ADD(&forceBuffers[atom2], (mm_ulong) ((mm_long) (local_force[tgx].x*0x100000000)));
                    ATOMIC_ADD(&forceBuffers[atom2+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (local_force[tgx].y*0x100000000)));
                    ATOMIC_ADD(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (local_force[tgx].z*0x100000000)));
405
406
407
                    unsigned int offset = atom2;
                    STORE_DERIVATIVES_2
#else
408
                    unsigned int offset = atom2 + GROUP_ID*PADDED_NUM_ATOMS;
409
410
411
412
                    forceBuffers[offset].xyz += local_force[tgx].xyz;
                    STORE_DERIVATIVES_2
#endif
                }
413
414
415
416
            }
        }
        pos++;
    }
417
    energyBuffer[GLOBAL_ID] += energy;
418
    SAVE_PARAM_DERIVS
419
}