customGBEnergyN2.cu 16.3 KB
Newer Older
1
2
#define STORE_DERIVATIVE_1(INDEX) atomicAdd(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (deriv##INDEX##_1*0x100000000)));
#define STORE_DERIVATIVE_2(INDEX) atomicAdd(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].deriv##INDEX*0x100000000)));
3
4

typedef struct {
5
    real3 pos;
6
    real3 force;
7
8
9
10
11
12
13
14
15
16
    ATOM_PARAMETER_DATA
#ifdef NEED_PADDING
    float padding;
#endif
} AtomData;

/**
 * Compute a force based on pair interactions.
 */
extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forceBuffers, real* __restrict__ energyBuffer,
17
        const real4* __restrict__ posq, const unsigned int* __restrict__ exclusions, const ushort2* __restrict__ exclusionTiles,
18
#ifdef USE_CUTOFF
19
        const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, 
20
        unsigned int maxTiles, const real4* __restrict__ blockCenter, const real4* __restrict__ blockSize, const unsigned int* __restrict__ interactingAtoms
21
22
23
24
#else
        unsigned int numTiles
#endif
        PARAMETER_ARGUMENTS) {
25
26
27
28
    const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
    const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
    const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
    const unsigned int tbx = threadIdx.x - tgx;
29
30
    real energy = 0;
    __shared__ AtomData localData[THREAD_BLOCK_SIZE];
31
32

    // First loop: process tiles that contain exclusions.
33
    
34
35
36
37
38
39
    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
        const ushort2 tileIndices = exclusionTiles[pos];
        const unsigned int x = tileIndices.x;
        const unsigned int y = tileIndices.y;
40
        real3 force = make_real3(0);
41
        DECLARE_ATOM1_DERIVATIVES
42
        unsigned int atom1 = x*TILE_SIZE + tgx;
43
        real4 pos1 = posq[atom1];
44
45
46
        LOAD_ATOM1_PARAMETERS
#ifdef USE_EXCLUSIONS
        unsigned int excl = exclusions[pos*TILE_SIZE+tgx];
47
#endif
48
49
        if (x == y) {
            // This tile is on the diagonal.
50

51
            const unsigned int localAtomIndex = threadIdx.x;
52
            localData[localAtomIndex].pos = make_real3(pos1.x, pos1.y, pos1.z);
53
54
55
            LOAD_LOCAL_PARAMETERS_FROM_1
            for (unsigned int j = 0; j < TILE_SIZE; j++) {
                int atom2 = tbx+j;
56
57
                real3 pos2 = localData[atom2].pos;
                real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
58
59
60
61
62
63
64
65
66
67
#ifdef USE_PERIODIC
                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
                if (r2 < CUTOFF_SQUARED) {
#endif
                    real invR = RSQRT(r2);
peastman's avatar
peastman committed
68
                    real r = r2*invR;
69
70
71
72
                    LOAD_ATOM2_PARAMETERS
                    atom2 = y*TILE_SIZE+j;
                    real dEdR = 0;
                    real tempEnergy = 0;
73
#ifdef USE_EXCLUSIONS
74
75
76
77
78
79
80
81
82
83
84
85
86
                    bool isExcluded = !(excl & 0x1);
#endif
                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
                        COMPUTE_INTERACTION
                        dEdR /= -r;
                    }
                    energy += 0.5f*tempEnergy;
                    delta *= dEdR;
                    force.x -= delta.x;
                    force.y -= delta.y;
                    force.z -= delta.z;
#ifdef USE_CUTOFF
                }
87
88
#endif
#ifdef USE_EXCLUSIONS
89
                excl >>= 1;
90
#endif
91
92
93
94
95
96
97
            }
        }
        else {
            // This is an off-diagonal tile.

            const unsigned int localAtomIndex = threadIdx.x;
            unsigned int j = y*TILE_SIZE + tgx;
98
99
            real4 tempPosq = posq[j];
            localData[localAtomIndex].pos = make_real3(tempPosq.x, tempPosq.y, tempPosq.z);
100
101
102
            LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
            localData[localAtomIndex].force = make_real3(0);
            CLEAR_LOCAL_DERIVATIVES
103
#ifdef USE_EXCLUSIONS
104
            excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
105
#endif
106
107
108
            unsigned int tj = tgx;
            for (j = 0; j < TILE_SIZE; j++) {
                int atom2 = tbx+tj;
109
110
                real3 pos2 = localData[atom2].pos;
                real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
111
#ifdef USE_PERIODIC
112
113
114
                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
115
#endif
116
                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
117
#ifdef USE_CUTOFF
118
                if (r2 < CUTOFF_SQUARED) {
119
120
#endif
                    real invR = RSQRT(r2);
peastman's avatar
peastman committed
121
                    real r = r2*invR;
122
                    LOAD_ATOM2_PARAMETERS
123
                    atom2 = y*TILE_SIZE+tj;
124
125
                    real dEdR = 0;
                    real tempEnergy = 0;
126
127
128
129
#ifdef USE_EXCLUSIONS
                    bool isExcluded = !(excl & 0x1);
#endif
                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
130
131
132
                        COMPUTE_INTERACTION
                        dEdR /= -r;
                    }
133
                    energy += tempEnergy;
134
                    delta *= dEdR;
135
136
137
                    force.x -= delta.x;
                    force.y -= delta.y;
                    force.z -= delta.z;
138
139
140
141
142
                    atom2 = tbx+tj;
                    localData[atom2].force.x += delta.x;
                    localData[atom2].force.y += delta.y;
                    localData[atom2].force.z += delta.z;
                    RECORD_DERIVATIVE_2
143
#ifdef USE_CUTOFF
144
                }
145
146
#endif
#ifdef USE_EXCLUSIONS
147
                excl >>= 1;
148
#endif
149
                tj = (tj + 1) & (TILE_SIZE - 1);
150
            }
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
        }

        // Write results.

        unsigned int offset = x*TILE_SIZE + tgx;
        atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
        atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
        atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
        STORE_DERIVATIVES_1
        if (x != y) {
            offset = y*TILE_SIZE + tgx;
            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
            STORE_DERIVATIVES_2
        }
    }

    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
    // of them (no cutoff).

172
#ifdef USE_CUTOFF
173
    unsigned int numTiles = interactionCount[0];
174
175
    int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((long long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
    int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
176
#else
177
178
    int pos = (int) (warp*(long long)numTiles/totalWarps);
    int end = (int) ((warp+1)*(long long)numTiles/totalWarps);
179
180
181
182
#endif
    int skipBase = 0;
    int currentSkipIndex = tbx;
    __shared__ int atomIndices[THREAD_BLOCK_SIZE];
183
    __shared__ volatile int skipTiles[THREAD_BLOCK_SIZE];
184
185
186
187
188
189
190
191
192
193
    skipTiles[threadIdx.x] = -1;
    
    while (pos < end) {
        const bool isExcluded = false;
        real3 force = make_real3(0);
        DECLARE_ATOM1_DERIVATIVES
        bool includeTile = true;
        
        // Extract the coordinates of this tile.
        
194
        int x, y;
195
196
197
        bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF
        if (numTiles <= maxTiles) {
198
            x = tiles[pos];
199
200
201
202
            real4 blockSizeX = blockSize[x];
            singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= CUTOFF &&
                                  0.5f*periodicBoxSize.y-blockSizeX.y >= CUTOFF &&
                                  0.5f*periodicBoxSize.z-blockSizeX.z >= CUTOFF);
203
204
205
206
        }
        else
#endif
        {
207
            y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
208
209
210
211
212
213
214
215
216
217
218
219
            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
                y += (x < y ? -1 : 1);
                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
            }

            // Skip over tiles that have exclusions, since they were already processed.

            while (skipTiles[tbx+TILE_SIZE-1] < pos) {
                if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
                    ushort2 tile = exclusionTiles[skipBase+tgx];
                    skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
220
221
                }
                else
222
223
224
225
226
227
228
229
230
231
232
233
234
                    skipTiles[threadIdx.x] = end;
                skipBase += TILE_SIZE;            
                currentSkipIndex = tbx;
            }
            while (skipTiles[currentSkipIndex] < pos)
                currentSkipIndex++;
            includeTile = (skipTiles[currentSkipIndex] != pos);
        }
        if (includeTile) {
            unsigned int atom1 = x*TILE_SIZE + tgx;

            // Load atom data for this tile.

235
            real4 pos1 = posq[atom1];
236
237
238
239
240
241
            LOAD_ATOM1_PARAMETERS
            const unsigned int localAtomIndex = threadIdx.x;
#ifdef USE_CUTOFF
            unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+tgx] : y*TILE_SIZE + tgx);
#else
            unsigned int j = y*TILE_SIZE + tgx;
242
#endif
243
244
            atomIndices[threadIdx.x] = j;
            if (j < PADDED_NUM_ATOMS) {
245
246
                real4 tempPosq = posq[j];
                localData[localAtomIndex].pos = make_real3(tempPosq.x, tempPosq.y, tempPosq.z);
247
248
249
250
251
252
253
254
                LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
                localData[localAtomIndex].force = make_real3(0);
                CLEAR_LOCAL_DERIVATIVES
            }
#ifdef USE_PERIODIC
            if (singlePeriodicCopy) {
                // The box is small enough that we can just translate all the atoms into a single periodic
                // box, then skip having to apply periodic boundary conditions later.
255

256
                real4 blockCenterX = blockCenter[x];
257
258
259
260
261
262
                pos1.x -= floor((pos1.x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
                pos1.y -= floor((pos1.y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
                pos1.z -= floor((pos1.z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
                localData[threadIdx.x].pos.x -= floor((localData[threadIdx.x].pos.x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
                localData[threadIdx.x].pos.y -= floor((localData[threadIdx.x].pos.y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
                localData[threadIdx.x].pos.z -= floor((localData[threadIdx.x].pos.z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
263
264
265
                unsigned int tj = tgx;
                for (j = 0; j < TILE_SIZE; j++) {
                    int atom2 = tbx+tj;
266
267
                    real3 pos2 = localData[atom2].pos;
                    real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
268
269
270
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
                    if (r2 < CUTOFF_SQUARED) {
271
#endif
272
                        real invR = RSQRT(r2);
peastman's avatar
peastman committed
273
                        real r = r2*invR;
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
                        LOAD_ATOM2_PARAMETERS
                        atom2 = atomIndices[tbx+tj];
                        real dEdR = 0;
                        real tempEnergy = 0;
                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
                            COMPUTE_INTERACTION
                            dEdR /= -r;
                        }
                        energy += tempEnergy;
                        delta *= dEdR;
                        force.x -= delta.x;
                        force.y -= delta.y;
                        force.z -= delta.z;
                        atom2 = tbx+tj;
                        localData[atom2].force.x += delta.x;
                        localData[atom2].force.y += delta.y;
                        localData[atom2].force.z += delta.z;
                        RECORD_DERIVATIVE_2
#ifdef USE_CUTOFF
                    }
294
#endif
295
296
297
298
299
300
301
302
303
304
305
                    tj = (tj + 1) & (TILE_SIZE - 1);
                }
            }
            else
#endif
            {
                // We need to apply periodic boundary conditions separately for each interaction.

                unsigned int tj = tgx;
                for (j = 0; j < TILE_SIZE; j++) {
                    int atom2 = tbx+tj;
306
307
                    real3 pos2 = localData[atom2].pos;
                    real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
308
#ifdef USE_PERIODIC
309
310
311
                    delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
                    delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
                    delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
312
#endif
313
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
314
#ifdef USE_CUTOFF
315
                    if (r2 < CUTOFF_SQUARED) {
316
317
#endif
                        real invR = RSQRT(r2);
peastman's avatar
peastman committed
318
                        real r = r2*invR;
319
                        LOAD_ATOM2_PARAMETERS
320
                        atom2 = atomIndices[tbx+tj];
321
322
323
324
325
326
327
328
                        real dEdR = 0;
                        real tempEnergy = 0;
                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
                            COMPUTE_INTERACTION
                            dEdR /= -r;
                        }
                        energy += tempEnergy;
                        delta *= dEdR;
329
330
331
                        force.x -= delta.x;
                        force.y -= delta.y;
                        force.z -= delta.z;
332
                        atom2 = tbx+tj;
333
334
335
                        localData[atom2].force.x += delta.x;
                        localData[atom2].force.y += delta.y;
                        localData[atom2].force.z += delta.z;
336
337
338
                        RECORD_DERIVATIVE_2
#ifdef USE_CUTOFF
                    }
339
340
#endif
                    tj = (tj + 1) & (TILE_SIZE - 1);
341
342
343
                }
            }
        
344
345
346
347
348
349
            // Write results.

            atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
            atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
            atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
            unsigned int offset = atom1;
350
            STORE_DERIVATIVES_1
351
352
353
354
355
356
357
358
359
360
361
362
#ifdef USE_CUTOFF
            unsigned int atom2 = atomIndices[threadIdx.x];
#else
            unsigned int atom2 = y*TILE_SIZE + tgx;
#endif
            if (atom2 < PADDED_NUM_ATOMS) {
                atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
                atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
                atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
                offset = atom2;
                STORE_DERIVATIVES_2
            }
363
364
        }
        pos++;
365
    }
366
367
    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
}