customGBValueN2.cu 12.3 KB
Newer Older
1
typedef struct {
2
3
    real3 pos;
    real value;
4
5
6
7
8
9
10
11
12
13
    ATOM_PARAMETER_DATA
#ifdef NEED_PADDING
    float padding;
#endif
} AtomData;

/**
 * Compute a value based on pair interactions.
 */
extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const unsigned int* __restrict__ exclusions,
14
        const ushort2* __restrict__ exclusionTiles, unsigned long long* __restrict__ global_value,
15
#ifdef USE_CUTOFF
16
17
18
        const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, const real4* __restrict__ blockCenter,
        const real4* __restrict__ blockSize, const unsigned int* __restrict__ interactingAtoms
19
20
21
22
#else
        unsigned int numTiles
#endif
        PARAMETER_ARGUMENTS) {
23
24
25
26
    const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
    const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
    const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
    const unsigned int tbx = threadIdx.x - tgx;
27
    __shared__ AtomData localData[THREAD_BLOCK_SIZE];
28
29

    // First loop: process tiles that contain exclusions.
30
    
31
32
33
34
35
36
    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
        const ushort2 tileIndices = exclusionTiles[pos];
        const unsigned int x = tileIndices.x;
        const unsigned int y = tileIndices.y;
37
        real value = 0;
38
        unsigned int atom1 = x*TILE_SIZE + tgx;
39
        real4 pos1 = posq[atom1];
40
41
42
        LOAD_ATOM1_PARAMETERS
#ifdef USE_EXCLUSIONS
        unsigned int excl = exclusions[pos*TILE_SIZE+tgx];
43
#endif
44
45
        if (x == y) {
            // This tile is on the diagonal.
46

47
            const unsigned int localAtomIndex = threadIdx.x;
48
            localData[localAtomIndex].pos = make_real3(pos1.x, pos1.y, pos1.z);
49
50
51
            LOAD_LOCAL_PARAMETERS_FROM_1
            for (unsigned int j = 0; j < TILE_SIZE; j++) {
                int atom2 = tbx+j;
52
53
                real3 pos2 = localData[atom2].pos;
                real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
54
#ifdef USE_PERIODIC
55
                APPLY_PERIODIC_TO_DELTA(delta)
56
57
58
59
60
61
#endif
                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
                if (r2 < CUTOFF_SQUARED) {
#endif
                    real invR = RSQRT(r2);
peastman's avatar
peastman committed
62
                    real r = r2*invR;
63
64
65
66
                    LOAD_ATOM2_PARAMETERS
                    atom2 = y*TILE_SIZE+j;
                    real tempValue1 = 0;
                    real tempValue2 = 0;
67
#ifdef USE_EXCLUSIONS
68
69
                    bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1));
                    if (!isExcluded && atom1 != atom2) {
70
#else
71
72
73
74
75
                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
#endif
                        COMPUTE_VALUE
                    }
                    value += tempValue1;
76
                    ADD_TEMP_DERIVS1
77
78
#ifdef USE_CUTOFF
                }
79
80
#endif
#ifdef USE_EXCLUSIONS
81
                excl >>= 1;
82
#endif
83
84
85
86
87
88
89
            }
        }
        else {
            // This is an off-diagonal tile.

            const unsigned int localAtomIndex = threadIdx.x;
            unsigned int j = y*TILE_SIZE + tgx;
90
91
            real4 tempPosq = posq[j];
            localData[localAtomIndex].pos = make_real3(tempPosq.x, tempPosq.y, tempPosq.z);
92
93
            LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
            localData[localAtomIndex].value = 0;
94
#ifdef USE_EXCLUSIONS
95
            excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
96
#endif
97
98
99
            unsigned int tj = tgx;
            for (j = 0; j < TILE_SIZE; j++) {
                int atom2 = tbx+tj;
100
101
                real3 pos2 = localData[atom2].pos;
                real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
102
#ifdef USE_PERIODIC
103
                APPLY_PERIODIC_TO_DELTA(delta)
104
#endif
105
                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
106
#ifdef USE_CUTOFF
107
                if (r2 < CUTOFF_SQUARED) {
108
109
#endif
                    real invR = RSQRT(r2);
peastman's avatar
peastman committed
110
                    real r = r2*invR;
111
                    LOAD_ATOM2_PARAMETERS
112
                    atom2 = y*TILE_SIZE+tj;
113
114
115
                    real tempValue1 = 0;
                    real tempValue2 = 0;
#ifdef USE_EXCLUSIONS
116
117
                    bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1));
                    if (!isExcluded) {
118
#else
119
                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
120
121
122
123
#endif
                        COMPUTE_VALUE
                    }
                    value += tempValue1;
124
                    localData[tbx+tj].value += tempValue2;
125
126
                    ADD_TEMP_DERIVS1
                    ADD_TEMP_DERIVS2
127
#ifdef USE_CUTOFF
128
                }
129
130
#endif
#ifdef USE_EXCLUSIONS
131
                excl >>= 1;
132
#endif
133
                tj = (tj + 1) & (TILE_SIZE - 1);
134
            }
135
136
137
138
        }

        // Write results.

139
140
141
        unsigned int offset1 = x*TILE_SIZE + tgx;
        atomicAdd(&global_value[offset1], static_cast<unsigned long long>((long long) (value*0x100000000)));
        STORE_PARAM_DERIVS1
142
        if (x != y) {
143
144
145
            unsigned int offset2 = y*TILE_SIZE + tgx;
            atomicAdd(&global_value[offset2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].value*0x100000000)));
            STORE_PARAM_DERIVS2
146
147
148
149
150
        }
    }

    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
    // of them (no cutoff).
151
152

#ifdef USE_CUTOFF
153
    unsigned int numTiles = interactionCount[0];
154
155
    if (numTiles > maxTiles)
        return; // There wasn't enough memory for the neighbor list.
156
157
    int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((long long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
    int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
158
#else
159
160
    int pos = (int) (warp*(long long)numTiles/totalWarps);
    int end = (int) ((warp+1)*(long long)numTiles/totalWarps);
161
162
163
164
#endif
    int skipBase = 0;
    int currentSkipIndex = tbx;
    __shared__ int atomIndices[THREAD_BLOCK_SIZE];
165
    __shared__ volatile int skipTiles[THREAD_BLOCK_SIZE];
166
167
168
169
170
171
172
173
    skipTiles[threadIdx.x] = -1;
    
    while (pos < end) {
        real value = 0;
        bool includeTile = true;
        
        // Extract the coordinates of this tile.
        
174
        int x, y;
175
176
        bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF
177
178
179
180
181
182
183
184
185
186
        x = tiles[pos];
        real4 blockSizeX = blockSize[x];
        singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= CUTOFF &&
                              0.5f*periodicBoxSize.y-blockSizeX.y >= CUTOFF &&
                              0.5f*periodicBoxSize.z-blockSizeX.z >= CUTOFF);
#else
        y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
        x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
        if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
            y += (x < y ? -1 : 1);
187
            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
188
        }
189

190
        // Skip over tiles that have exclusions, since they were already processed.
191

192
193
194
195
        while (skipTiles[tbx+TILE_SIZE-1] < pos) {
            if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
                ushort2 tile = exclusionTiles[skipBase+tgx];
                skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
196
            }
197
198
199
200
            else
                skipTiles[threadIdx.x] = end;
            skipBase += TILE_SIZE;            
            currentSkipIndex = tbx;
201
        }
202
203
204
205
        while (skipTiles[currentSkipIndex] < pos)
            currentSkipIndex++;
        includeTile = (skipTiles[currentSkipIndex] != pos);
#endif
206
207
208
209
210
        if (includeTile) {
            unsigned int atom1 = x*TILE_SIZE + tgx;

            // Load atom data for this tile.
            
211
            real4 pos1 = posq[atom1];
212
213
214
215
216
217
218
219
220
            LOAD_ATOM1_PARAMETERS
            const unsigned int localAtomIndex = threadIdx.x;
#ifdef USE_CUTOFF
            unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+tgx] : y*TILE_SIZE + tgx);
#else
            unsigned int j = y*TILE_SIZE + tgx;
#endif
            atomIndices[threadIdx.x] = j;
            if (j < PADDED_NUM_ATOMS) {
221
222
                real4 tempPosq = posq[j];
                localData[localAtomIndex].pos = make_real3(tempPosq.x, tempPosq.y, tempPosq.z);
223
224
225
226
227
228
229
                LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
                localData[localAtomIndex].value = 0;
            }
#ifdef USE_PERIODIC
            if (singlePeriodicCopy) {
                // The box is small enough that we can just translate all the atoms into a single periodic
                // box, then skip having to apply periodic boundary conditions later.
230

231
                real4 blockCenterX = blockCenter[x];
232
233
                APPLY_PERIODIC_TO_POS_WITH_CENTER(pos1, blockCenterX)
                APPLY_PERIODIC_TO_POS_WITH_CENTER(localData[threadIdx.x].pos, blockCenterX)
234
235
236
                unsigned int tj = tgx;
                for (unsigned int j = 0; j < TILE_SIZE; j++) {
                    int atom2 = tbx+tj;
237
238
                    real3 pos2 = localData[atom2].pos;
                    real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
239
240
241
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                    if (r2 < CUTOFF_SQUARED) {
                        real invR = RSQRT(r2);
peastman's avatar
peastman committed
242
                        real r = r2*invR;
243
244
245
246
247
248
                        LOAD_ATOM2_PARAMETERS
                        atom2 = atomIndices[tbx+tj];
                        real tempValue1 = 0;
                        real tempValue2 = 0;
                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
                            COMPUTE_VALUE
249
                        }
250
251
                        value += tempValue1;
                        localData[tbx+tj].value += tempValue2;
252
253
                        ADD_TEMP_DERIVS1
                        ADD_TEMP_DERIVS2
254
                    }
255
                    tj = (tj + 1) & (TILE_SIZE - 1);
256
                }
257
258
            }
            else
259
#endif
260
261
            {
                // We need to apply periodic boundary conditions separately for each interaction.
262

263
264
265
                unsigned int tj = tgx;
                for (unsigned int j = 0; j < TILE_SIZE; j++) {
                    int atom2 = tbx+tj;
266
267
                    real3 pos2 = localData[atom2].pos;
                    real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
268
#ifdef USE_PERIODIC
269
                    APPLY_PERIODIC_TO_DELTA(delta)
270
#endif
271
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
272
#ifdef USE_CUTOFF
273
                    if (r2 < CUTOFF_SQUARED) {
274
275
#endif
                        real invR = RSQRT(r2);
peastman's avatar
peastman committed
276
                        real r = r2*invR;
277
                        LOAD_ATOM2_PARAMETERS
278
                        atom2 = atomIndices[tbx+tj];
279
280
281
282
283
284
285
                        real tempValue1 = 0;
                        real tempValue2 = 0;
                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
                            COMPUTE_VALUE
                        }
                        value += tempValue1;
                        localData[tbx+tj].value += tempValue2;
286
287
                        ADD_TEMP_DERIVS1
                        ADD_TEMP_DERIVS2
288
289
#ifdef USE_CUTOFF
                    }
290
291
#endif
                    tj = (tj + 1) & (TILE_SIZE - 1);
292
293
294
                }
            }
        
295
296
            // Write results.

297
298
299
            unsigned int offset1 = atom1;
            atomicAdd(&global_value[offset1], static_cast<unsigned long long>((long long) (value*0x100000000)));
            STORE_PARAM_DERIVS1
300
301
302
303
304
#ifdef USE_CUTOFF
            unsigned int atom2 = atomIndices[threadIdx.x];
#else
            unsigned int atom2 = y*TILE_SIZE + tgx;
#endif
305
306
307
308
309
            if (atom2 < PADDED_NUM_ATOMS) {
                unsigned int offset2 = atom2;
                atomicAdd(&global_value[offset2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].value*0x100000000)));
                STORE_PARAM_DERIVS2
            }
310
311
        }
        pos++;
312
    }
313
}