customGBValueN2_cpu.cl 14 KB
Newer Older
1
2
3
#ifdef SUPPORTS_64_BIT_ATOMICS
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#endif
4
5
6
7

/**
 * Compute a value based on pair interactions.
 */
8
__kernel void computeN2Value(__global const real4* restrict posq, __local real4* restrict local_posq, __global const unsigned int* restrict exclusions,
9
10
11
12
13
14
15
        __global const ushort2* exclusionTiles,
#ifdef SUPPORTS_64_BIT_ATOMICS
        __global long* restrict global_value,
#else
        __global real* restrict global_value,
#endif
        __local real* restrict local_value,
16
#ifdef USE_CUTOFF
17
18
19
        __global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter,
        __global const real4* restrict blockSize, __global const int* restrict interactingAtoms
20
21
22
23
#else
        unsigned int numTiles
#endif
        PARAMETER_ARGUMENTS) {
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55

    // First loop: process tiles that contain exclusions.
    
    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+get_group_id(0)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0);
    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(get_group_id(0)+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0);
    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
        const ushort2 tileIndices = exclusionTiles[pos];
        const unsigned int x = tileIndices.x;
        const unsigned int y = tileIndices.y;

        // Load the data for this tile.

        for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
            unsigned int j = y*TILE_SIZE + localAtomIndex;
            local_posq[localAtomIndex] = posq[j];
            LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
        }
        if (x == y) {
            // This tile is on the diagonal.

            for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
#ifdef USE_EXCLUSIONS
                unsigned int excl = exclusions[pos*TILE_SIZE+tgx];
#endif
                unsigned int atom1 = x*TILE_SIZE+tgx;
                real value = 0;
                real4 posq1 = posq[atom1];
                LOAD_ATOM1_PARAMETERS
                for (unsigned int j = 0; j < TILE_SIZE; j++) {
                    real4 posq2 = local_posq[j];
                    real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
56
                    APPLY_PERIODIC_TO_DELTA(delta)
57
58
#endif
                    real r2 = dot(delta.xyz, delta.xyz);
59
#ifdef USE_CUTOFF
60
61
62
                    if (r2 < CUTOFF_SQUARED) {
#endif
                        real invR = RSQRT(r2);
peastman's avatar
peastman committed
63
                        real r = r2*invR;
64
65
66
67
68
69
70
71
                        unsigned int atom2 = j;
                        LOAD_ATOM2_PARAMETERS
                        atom2 = y*TILE_SIZE+j;
                        real tempValue1 = 0;
                        real tempValue2 = 0;
#ifdef USE_EXCLUSIONS
                        bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1));
                        if (!isExcluded && atom1 != atom2) {
72
#else
73
                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
74
#endif
75
76
77
                            COMPUTE_VALUE
                        }
                        value += tempValue1;
78
                        ADD_TEMP_DERIVS1
79
#ifdef USE_CUTOFF
80
                    }
81
82
#endif
#ifdef USE_EXCLUSIONS
83
                    excl >>= 1;
84
#endif
85
                }
86

87
                // Write results.
88

89
#ifdef SUPPORTS_64_BIT_ATOMICS
peastman's avatar
peastman committed
90
91
                unsigned int offset1 = atom1;
                atom_add(&global_value[offset1], (long) (value*0x100000000));
92
#else
peastman's avatar
peastman committed
93
94
                unsigned int offset1 = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
                global_value[offset1] += value;
95
#endif
peastman's avatar
peastman committed
96
                STORE_PARAM_DERIVS1
97
98
            }
        }
99
100
        else {
            // This is an off-diagonal tile.
101

102
103
            for (int tgx = 0; tgx < TILE_SIZE; tgx++)
                local_value[tgx] = 0;
104
105
            for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
#ifdef USE_EXCLUSIONS
106
                unsigned int excl = exclusions[pos*TILE_SIZE+tgx];
107
108
#endif
                unsigned int atom1 = x*TILE_SIZE+tgx;
109
110
                real value = 0;
                real4 posq1 = posq[atom1];
111
112
                LOAD_ATOM1_PARAMETERS
                for (unsigned int j = 0; j < TILE_SIZE; j++) {
113
114
                    real4 posq2 = local_posq[j];
                    real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
115
#ifdef USE_PERIODIC
116
                    APPLY_PERIODIC_TO_DELTA(delta)
117
#endif
118
                    real r2 = dot(delta.xyz, delta.xyz);
119
120
121
#ifdef USE_CUTOFF
                    if (r2 < CUTOFF_SQUARED) {
#endif
122
                        real invR = RSQRT(r2);
peastman's avatar
peastman committed
123
                        real r = r2*invR;
124
125
126
127
128
                        unsigned int atom2 = j;
                        LOAD_ATOM2_PARAMETERS
                        atom2 = y*TILE_SIZE+j;
                        real tempValue1 = 0;
                        real tempValue2 = 0;
129
#ifdef USE_EXCLUSIONS
130
131
                        bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1));
                        if (!isExcluded) {
132
#else
133
                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
134
#endif
135
136
137
138
                            COMPUTE_VALUE
                        }
                        value += tempValue1;
                        local_value[j] += tempValue2;
139
140
                        ADD_TEMP_DERIVS1
                        ADD_TEMP_DERIVS2
141
142
143
144
145
146
147
148
#ifdef USE_CUTOFF
                    }
#endif
#ifdef USE_EXCLUSIONS
                    excl >>= 1;
#endif
                }

149
                // Write results for atom1.
150

151
#ifdef SUPPORTS_64_BIT_ATOMICS
152
153
                unsigned int offset1 = atom1;
                atom_add(&global_value[offset1], (long) (value*0x100000000));
154
#else
155
156
                unsigned int offset1 = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
                global_value[offset1] += value;
157
#endif
158
                STORE_PARAM_DERIVS1
159
160
161
162
163
164
            }

            // Write results.

            for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
#ifdef SUPPORTS_64_BIT_ATOMICS
165
166
                unsigned int offset2 = y*TILE_SIZE+tgx;
                atom_add(&global_value[offset2], (long) (local_value[tgx]*0x100000000));
167
#else
168
169
                unsigned int offset2 = y*TILE_SIZE+tgx + get_group_id(0)*PADDED_NUM_ATOMS;
                global_value[offset2] += local_value[tgx];
170
#endif
171
                STORE_PARAM_DERIVS2
172
173
            }
        }
174
    }
175

176
177
    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
    // of them (no cutoff).
178

179
180
#ifdef USE_CUTOFF
    const unsigned int numTiles = interactionCount[0];
181
182
    if (numTiles > maxTiles)
        return; // There wasn't enough memory for the neighbor list.
183
184
    int pos = (int) (get_group_id(0)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0));
    int end = (int) ((get_group_id(0)+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0));
185
#else
186
187
    int pos = (int) (get_group_id(0)*(long)numTiles/get_num_groups(0));
    int end = (int) ((get_group_id(0)+1)*(long)numTiles/get_num_groups(0));
188
189
190
191
192
193
194
195
196
197
#endif
    int nextToSkip = -1;
    int currentSkipIndex = 0;
    __local int atomIndices[TILE_SIZE];

    while (pos < end) {
        bool includeTile = true;
        
        // Extract the coordinates of this tile.
        
198
        int x, y;
199
200
        bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF
201
202
203
204
205
206
207
208
209
210
        x = tiles[pos];
        real4 blockSizeX = blockSize[x];
        singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= CUTOFF &&
                              0.5f*periodicBoxSize.y-blockSizeX.y >= CUTOFF &&
                              0.5f*periodicBoxSize.z-blockSizeX.z >= CUTOFF);
#else
        y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
        x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
        if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
            y += (x < y ? -1 : 1);
211
            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
212
        }
213

214
        // Skip over tiles that have exclusions, since they were already processed.
215

216
217
218
219
        while (nextToSkip < pos) {
            if (currentSkipIndex < NUM_TILES_WITH_EXCLUSIONS) {
                ushort2 tile = exclusionTiles[currentSkipIndex++];
                nextToSkip = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
220
            }
221
222
            else
                nextToSkip = end;
223
        }
224
225
        includeTile = (nextToSkip != pos);
#endif
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
        if (includeTile) {
            // Load the data for this tile.

            for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
#ifdef USE_CUTOFF
                unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+localAtomIndex] : y*TILE_SIZE+localAtomIndex);
#else
                unsigned int j = y*TILE_SIZE+localAtomIndex;
#endif
                atomIndices[localAtomIndex] = j;
                if (j < PADDED_NUM_ATOMS) {
                    local_posq[localAtomIndex] = posq[j];
                    LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
                    local_value[localAtomIndex] = 0;
                }
            }
242
#ifdef USE_PERIODIC
243
244
245
246
247
248
            if (singlePeriodicCopy) {
                // The box is small enough that we can just translate all the atoms into a single periodic
                // box, then skip having to apply periodic boundary conditions later.

                real4 blockCenterX = blockCenter[x];
                for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++)
249
                    APPLY_PERIODIC_TO_POS_WITH_CENTER(local_posq[tgx], blockCenterX)
250
251
252
253
                for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
                    unsigned int atom1 = x*TILE_SIZE+tgx;
                    real value = 0;
                    real4 posq1 = posq[atom1];
254
                    APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX)
255
256
257
258
259
260
261
                    LOAD_ATOM1_PARAMETERS
                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
                        real4 posq2 = local_posq[j];
                        real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
                        real r2 = dot(delta.xyz, delta.xyz);
                        if (atom1 < NUM_ATOMS && atomIndices[j] < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
                            real invR = RSQRT(r2);
peastman's avatar
peastman committed
262
                            real r = r2*invR;
263
264
265
266
267
268
269
270
                            unsigned int atom2 = j;
                            LOAD_ATOM2_PARAMETERS
                            atom2 = atomIndices[j];
                            real tempValue1 = 0;
                            real tempValue2 = 0;
                            COMPUTE_VALUE
                            value += tempValue1;
                            local_value[j] += tempValue2;
271
272
                            ADD_TEMP_DERIVS1
                            ADD_TEMP_DERIVS2
273
                        }
274
                    }
275

276
                    // Write results for atom1.
277

278
#ifdef SUPPORTS_64_BIT_ATOMICS
peastman's avatar
peastman committed
279
280
                    unsigned int offset1 = atom1;
                    atom_add(&global_value[offset1], (long) (value*0x100000000));
281
#else
peastman's avatar
peastman committed
282
283
                    unsigned int offset1 = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
                    global_value[offset1] += value;
284
#endif
peastman's avatar
peastman committed
285
                    STORE_PARAM_DERIVS1
286
287
288
289
290
                }
            }
            else
#endif
            {
291
                // We need to apply periodic boundary conditions separately for each interaction.
292
293
294

                for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
                    unsigned int atom1 = x*TILE_SIZE+tgx;
295
296
                    real value = 0;
                    real4 posq1 = posq[atom1];
297
298
                    LOAD_ATOM1_PARAMETERS
                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
299
300
                        real4 posq2 = local_posq[j];
                        real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
301
#ifdef USE_PERIODIC
302
                        APPLY_PERIODIC_TO_DELTA(delta)
303
#endif
304
                        real r2 = dot(delta.xyz, delta.xyz);
305
#ifdef USE_CUTOFF
306
                        if (atom1 < NUM_ATOMS && atomIndices[j] < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
307
#else
308
                        if (atom1 < NUM_ATOMS && atomIndices[j] < NUM_ATOMS) {
309
#endif
310
                            real invR = RSQRT(r2);
peastman's avatar
peastman committed
311
                            real r = r2*invR;
312
313
314
315
316
                            unsigned int atom2 = j;
                            LOAD_ATOM2_PARAMETERS
                            atom2 = atomIndices[j];
                            real tempValue1 = 0;
                            real tempValue2 = 0;
317
                            COMPUTE_VALUE
318
319
                            value += tempValue1;
                            local_value[j] += tempValue2;
320
321
                            ADD_TEMP_DERIVS1
                            ADD_TEMP_DERIVS2
322
323
324
325
326
                        }
                    }

                    // Write results for atom1.

327
#ifdef SUPPORTS_64_BIT_ATOMICS
328
329
                    unsigned int offset1 = atom1;
                    atom_add(&global_value[offset1], (long) (value*0x100000000));
330
#else
331
332
                    unsigned int offset1 = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
                    global_value[offset1] += value;
333
#endif
334
                    STORE_PARAM_DERIVS1
335
336
337
                }
            }

338
            // Write results.
339
340

            for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
341
342
343
344
345
346
347
#ifdef USE_CUTOFF
                unsigned int atom2 = atomIndices[tgx];
#else
                unsigned int atom2 = y*TILE_SIZE + tgx;
#endif
                if (atom2 < PADDED_NUM_ATOMS) {
#ifdef SUPPORTS_64_BIT_ATOMICS
348
349
                    unsigned int offset2 = atom2;
                    atom_add(&global_value[offset2], (long) (local_value[tgx]*0x100000000));
350
#else
351
352
                    unsigned int offset2 = atom2 + get_group_id(0)*PADDED_NUM_ATOMS;
                    global_value[offset2] += local_value[tgx];
353
#endif
354
                    STORE_PARAM_DERIVS2
355
                }
356
357
358
359
360
            }
        }
        pos++;
    }
}