// customGBEnergyN2_cpu.cl
// Helpers that add one atom's derivative value into the derivative buffer(s).
// Both macros expect a variable named `offset` to be in scope where they are
// expanded; STORE_DERIVATIVE_2 additionally expects `tgx`. INDEX is pasted
// into identifiers (deriv<INDEX>_1, local_deriv<INDEX>), so it must be a
// single token. Each expansion already ends with a semicolon.
#ifdef SUPPORTS_64_BIT_ATOMICS
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
// 64-bit atomics path: a single `derivBuffers` array holds every derivative,
// with derivative INDEX starting at (INDEX-1)*PADDED_NUM_ATOMS. The value is
// multiplied by 0x100000000 (2^32), converted to long and added atomically.
#define STORE_DERIVATIVE_1(INDEX) atom_add(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (long) (deriv##INDEX##_1*0x100000000));
#define STORE_DERIVATIVE_2(INDEX) atom_add(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (long) (local_deriv##INDEX[tgx]*0x100000000));
#else
// Fallback path: one buffer per derivative (derivBuffers<INDEX>), updated
// with a plain, non-atomic +=.
#define STORE_DERIVATIVE_1(INDEX) derivBuffers##INDEX[offset] += deriv##INDEX##_1;
#define STORE_DERIVATIVE_2(INDEX) derivBuffers##INDEX[offset] += local_deriv##INDEX[tgx];
#endif
9
10
11
12

/**
 * Compute a force based on pair interactions.
 */
13
14
15
__kernel void computeN2Energy(
#ifdef SUPPORTS_64_BIT_ATOMICS
        __global long* restrict forceBuffers,
16
#else
17
        __global real4* restrict forceBuffers,
18
#endif
19
        __global mixed* restrict energyBuffer, __local real4* restrict local_force,
20
        __global const real4* restrict posq, __local real4* restrict local_posq, __global const unsigned int* restrict exclusions,
Peter Eastman's avatar
Bug fix  
Peter Eastman committed
21
        __global const ushort2* exclusionTiles, int needEnergy,
22
#ifdef USE_CUTOFF
23
24
25
        __global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter,
        __global const real4* restrict blockSize, __global const int* restrict interactingAtoms
26
#else
27
        unsigned int numTiles
28
#endif
29
        PARAMETER_ARGUMENTS) {
30
    mixed energy = 0;
31
    INIT_PARAM_DERIVS
32

33
34
35
36
37
38
39
40
    // First loop: process tiles that contain exclusions.
    
    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+get_group_id(0)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0);
    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(get_group_id(0)+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0);
    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
        const ushort2 tileIndices = exclusionTiles[pos];
        const unsigned int x = tileIndices.x;
        const unsigned int y = tileIndices.y;
41

42
        // Load the data for this tile.
43

44
45
46
47
        for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
            unsigned int j = y*TILE_SIZE + localAtomIndex;
            local_posq[localAtomIndex] = posq[j];
            LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
48
49
50
51
52
53
        }
        if (x == y) {
            // This tile is on the diagonal.

            for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
#ifdef USE_EXCLUSIONS
54
                unsigned int excl = exclusions[pos*TILE_SIZE+tgx];
55
56
#endif
                unsigned int atom1 = x*TILE_SIZE+tgx;
57
                real4 force = 0;
58
                DECLARE_ATOM1_DERIVATIVES
59
                real4 posq1 = posq[atom1];
60
61
                LOAD_ATOM1_PARAMETERS
                for (unsigned int j = 0; j < TILE_SIZE; j++) {
62
63
                    real4 posq2 = local_posq[j];
                    real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
64
#ifdef USE_PERIODIC
65
                    APPLY_PERIODIC_TO_DELTA(delta)
66
#endif
67
                    real r2 = dot(delta.xyz, delta.xyz);
68
69
70
#ifdef USE_CUTOFF
                    if (r2 < CUTOFF_SQUARED) {
#endif
71
                        real invR = RSQRT(r2);
peastman's avatar
peastman committed
72
                        real r = r2*invR;
73
74
75
76
77
                        unsigned int atom2 = j;
                        LOAD_ATOM2_PARAMETERS
                        atom2 = y*TILE_SIZE+j;
                        real dEdR = 0;
                        real tempEnergy = 0;
78
                        const real interactionScale = 0.5f;
79
80
81
82
83
84
85
86
87
88
#ifdef USE_EXCLUSIONS
                        bool isExcluded = !(excl & 0x1);
#endif
                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
                            COMPUTE_INTERACTION
                            dEdR /= -r;
                        }
                        energy += 0.5f*tempEnergy;
                        delta.xyz *= dEdR;
                        force.xyz -= delta.xyz;
89
90
91
92
93
94
95
96
#ifdef USE_CUTOFF
                    }
#endif
#ifdef USE_EXCLUSIONS
                    excl >>= 1;
#endif
                }

97
                // Write results.
98

99
100
101
102
103
#ifdef SUPPORTS_64_BIT_ATOMICS
                unsigned int offset = atom1;
                atom_add(&forceBuffers[offset], (long) (force.x*0x100000000));
                atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
                atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
104
                STORE_DERIVATIVES_1
105
106
107
108
109
#else
                unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
                forceBuffers[offset].xyz += force.xyz;
                STORE_DERIVATIVES_1
#endif
110
111
112
113
114
115
            }
        }
        else {
            // This is an off-diagonal tile.

            for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
116
                local_force[localAtomIndex] = 0;
117
118
                CLEAR_LOCAL_DERIVATIVES
            }
119
120
121
122
123
124
125
126
127
128
129
130
            for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
#ifdef USE_EXCLUSIONS
                unsigned int excl = exclusions[pos*TILE_SIZE+tgx];
#endif
                unsigned int atom1 = x*TILE_SIZE+tgx;
                real4 force = 0;
                DECLARE_ATOM1_DERIVATIVES
                real4 posq1 = posq[atom1];
                LOAD_ATOM1_PARAMETERS
                for (unsigned int j = 0; j < TILE_SIZE; j++) {
                    real4 posq2 = local_posq[j];
                    real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
131
#ifdef USE_PERIODIC
132
                    APPLY_PERIODIC_TO_DELTA(delta)
133
#endif
134
135
136
137
138
                    real r2 = dot(delta.xyz, delta.xyz);
#ifdef USE_CUTOFF
                    if (r2 < CUTOFF_SQUARED) {
#endif
                        real invR = RSQRT(r2);
peastman's avatar
peastman committed
139
                        real r = r2*invR;
140
141
142
143
144
                        unsigned int atom2 = j;
                        LOAD_ATOM2_PARAMETERS
                        atom2 = y*TILE_SIZE+j;
                        real dEdR = 0;
                        real tempEnergy = 0;
145
                        const real interactionScale = 1.0f;
146
147
148
149
150
151
152
153
#ifdef USE_EXCLUSIONS
                        bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1));
                        if (!isExcluded) {
#else
                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
#endif
                            COMPUTE_INTERACTION
                            dEdR /= -r;
154
                        }
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
                        energy += tempEnergy;
                        delta.xyz *= dEdR;
                        force.xyz -= delta.xyz;
                        atom2 = j;
                        local_force[atom2].xyz += delta.xyz;
                        RECORD_DERIVATIVE_2
#ifdef USE_CUTOFF
                    }
#endif
#ifdef USE_EXCLUSIONS
                    excl >>= 1;
#endif
                }

                // Write results for atom1.

#ifdef SUPPORTS_64_BIT_ATOMICS
                unsigned int offset = atom1;
                atom_add(&forceBuffers[offset], (long) (force.x*0x100000000));
                atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
                atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
                STORE_DERIVATIVES_1
#else
                unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
                forceBuffers[offset].xyz += force.xyz;
                STORE_DERIVATIVES_1
#endif
            }

            // Write results.

            for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
#ifdef SUPPORTS_64_BIT_ATOMICS
                unsigned int offset = y*TILE_SIZE+tgx;
                atom_add(&forceBuffers[offset], (long) (local_force[tgx].x*0x100000000));
                atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (local_force[tgx].y*0x100000000));
                atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (local_force[tgx].z*0x100000000));
                STORE_DERIVATIVES_2
#else
                unsigned int offset = y*TILE_SIZE+tgx + get_group_id(0)*PADDED_NUM_ATOMS;
                forceBuffers[offset].xyz += local_force[tgx].xyz;
                STORE_DERIVATIVES_2
#endif
            }
        }
    }
201

202
203
    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
    // of them (no cutoff).
204

205
206
#ifdef USE_CUTOFF
    const unsigned int numTiles = interactionCount[0];
207
208
    if (numTiles > maxTiles)
        return; // There wasn't enough memory for the neighbor list.
209
210
    int pos = (int) (get_group_id(0)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0));
    int end = (int) ((get_group_id(0)+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0));
211
#else
212
213
    int pos = (int) (get_group_id(0)*(long)numTiles/get_num_groups(0));
    int end = (int) ((get_group_id(0)+1)*(long)numTiles/get_num_groups(0));
214
215
216
217
218
219
220
221
222
223
224
#endif
    int nextToSkip = -1;
    int currentSkipIndex = 0;
    __local int atomIndices[TILE_SIZE];

    while (pos < end) {
        const bool isExcluded = false;
        bool includeTile = true;
        
        // Extract the coordinates of this tile.
        
225
        int x, y;
226
227
        bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF
228
229
230
231
232
233
234
235
236
237
        x = tiles[pos];
        real4 blockSizeX = blockSize[x];
        singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= CUTOFF &&
                              0.5f*periodicBoxSize.y-blockSizeX.y >= CUTOFF &&
                              0.5f*periodicBoxSize.z-blockSizeX.z >= CUTOFF);
#else
        y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
        x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
        if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
            y += (x < y ? -1 : 1);
238
            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
239
        }
240

241
        // Skip over tiles that have exclusions, since they were already processed.
242

243
244
245
246
        while (nextToSkip < pos) {
            if (currentSkipIndex < NUM_TILES_WITH_EXCLUSIONS) {
                ushort2 tile = exclusionTiles[currentSkipIndex++];
                nextToSkip = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
247
            }
248
249
            else
                nextToSkip = end;
250
        }
251
252
        includeTile = (nextToSkip != pos);
#endif
253
254
255
256
257
        if (includeTile) {
            // Load the data for this tile.

            for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
#ifdef USE_CUTOFF
peastman's avatar
peastman committed
258
                unsigned int j = interactingAtoms[pos*TILE_SIZE+localAtomIndex];
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
#else
                unsigned int j = y*TILE_SIZE+localAtomIndex;
#endif
                atomIndices[localAtomIndex] = j;
                if (j < PADDED_NUM_ATOMS) {
                    local_posq[localAtomIndex] = posq[j];
                    LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
                    local_force[localAtomIndex] = 0;
                    CLEAR_LOCAL_DERIVATIVES
                }
            }
#ifdef USE_PERIODIC
            if (singlePeriodicCopy) {
                // The box is small enough that we can just translate all the atoms into a single periodic
                // box, then skip having to apply periodic boundary conditions later.

                real4 blockCenterX = blockCenter[x];
                for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++)
277
                    APPLY_PERIODIC_TO_POS_WITH_CENTER(local_posq[tgx], blockCenterX)
278
279
280
281
282
                for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
                    unsigned int atom1 = x*TILE_SIZE+tgx;
                    real4 force = 0;
                    DECLARE_ATOM1_DERIVATIVES
                    real4 posq1 = posq[atom1];
283
                    APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX)
284
285
286
287
288
289
290
                    LOAD_ATOM1_PARAMETERS
                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
                        real4 posq2 = local_posq[j];
                        real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
                        real r2 = dot(delta.xyz, delta.xyz);
                        if (atom1 < NUM_ATOMS && atomIndices[j] < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
                            real invR = RSQRT(r2);
peastman's avatar
peastman committed
291
                            real r = r2*invR;
292
293
294
295
296
                            unsigned int atom2 = j;
                            LOAD_ATOM2_PARAMETERS
                            atom2 = atomIndices[j];
                            real dEdR = 0;
                            real tempEnergy = 0;
297
                            const real interactionScale = 1.0f;
298
299
300
301
302
303
304
305
306
                            COMPUTE_INTERACTION
                            dEdR /= -r;
                            energy += tempEnergy;
                            delta.xyz *= dEdR;
                            force.xyz -= delta.xyz;
                            atom2 = j;
                            local_force[atom2].xyz += delta.xyz;
                            RECORD_DERIVATIVE_2
                        }
307
                    }
308
309
310
311
312
313
314
315
316
317
318
319
320
321

                    // Write results for atom1.

#ifdef SUPPORTS_64_BIT_ATOMICS
                    unsigned int offset = atom1;
                    atom_add(&forceBuffers[offset], (long) (force.x*0x100000000));
                    atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
                    atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
                    STORE_DERIVATIVES_1
#else
                    unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
                    forceBuffers[offset].xyz += force.xyz;
                    STORE_DERIVATIVES_1
#endif
322
323
324
325
326
                }
            }
            else
#endif
            {
327
                // We need to apply periodic boundary conditions separately for each interaction.
328
329
330

                for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
                    unsigned int atom1 = x*TILE_SIZE+tgx;
331
                    real4 force = 0;
332
                    DECLARE_ATOM1_DERIVATIVES
333
                    real4 posq1 = posq[atom1];
334
335
                    LOAD_ATOM1_PARAMETERS
                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
336
337
                        real4 posq2 = local_posq[j];
                        real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
338
#ifdef USE_PERIODIC
339
                        APPLY_PERIODIC_TO_DELTA(delta)
340
#endif
341
                        real r2 = dot(delta.xyz, delta.xyz);
342
#ifdef USE_CUTOFF
343
344
345
                        if (atom1 < NUM_ATOMS && atomIndices[j] < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
                        if (atom1 < NUM_ATOMS && atomIndices[j] < NUM_ATOMS) {
346
#endif
347
                            real invR = RSQRT(r2);
peastman's avatar
peastman committed
348
                            real r = r2*invR;
349
350
351
352
353
                            unsigned int atom2 = j;
                            LOAD_ATOM2_PARAMETERS
                            atom2 = atomIndices[j];
                            real dEdR = 0;
                            real tempEnergy = 0;
354
                            const real interactionScale = 1.0f;
355
356
                            COMPUTE_INTERACTION
                            dEdR /= -r;
357
358
359
360
361
362
                            energy += tempEnergy;
                            delta.xyz *= dEdR;
                            force.xyz -= delta.xyz;
                            atom2 = j;
                            local_force[atom2].xyz += delta.xyz;
                            RECORD_DERIVATIVE_2
363
364
365
366
367
                        }
                    }

                    // Write results for atom1.

368
369
370
371
372
373
374
375
376
#ifdef SUPPORTS_64_BIT_ATOMICS
                    unsigned int offset = atom1;
                    atom_add(&forceBuffers[offset], (long) (force.x*0x100000000));
                    atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
                    atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
                    STORE_DERIVATIVES_1
#else
                    unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
                    forceBuffers[offset].xyz += force.xyz;
377
                    STORE_DERIVATIVES_1
378
#endif
379
380
381
                }
            }

382
            // Write results.
383
384

            for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
#ifdef USE_CUTOFF
                unsigned int atom2 = atomIndices[tgx];
#else
                unsigned int atom2 = y*TILE_SIZE + tgx;
#endif
                if (atom2 < PADDED_NUM_ATOMS) {
#ifdef SUPPORTS_64_BIT_ATOMICS
                    atom_add(&forceBuffers[atom2], (long) (local_force[tgx].x*0x100000000));
                    atom_add(&forceBuffers[atom2+PADDED_NUM_ATOMS], (long) (local_force[tgx].y*0x100000000));
                    atom_add(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (long) (local_force[tgx].z*0x100000000));
                    unsigned int offset = atom2;
                    STORE_DERIVATIVES_2
#else
                    unsigned int offset = atom2 + get_group_id(0)*PADDED_NUM_ATOMS;
                    forceBuffers[offset].xyz += local_force[tgx].xyz;
                    STORE_DERIVATIVES_2
#endif
                }
403
404
405
406
407
            }
        }
        pos++;
    }
    energyBuffer[get_global_id(0)] += energy;
408
    SAVE_PARAM_DERIVS
409
}