customNonbondedGroups.cl 9.39 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
#ifdef SUPPORTS_64_BIT_ATOMICS
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#endif

typedef struct {
    real x, y, z;
    real q;
    real fx, fy, fz;
    ATOM_PARAMETER_DATA
#ifndef PARAMETER_SIZE_IS_EVEN
    real padding;
#endif
} AtomData;

15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
/**
 * Find the maximum of a value across all threads in a warp, and return that to
 * every thread.
 */
int reduceMax(int val, __local int* temp) {
    int indexInWarp = get_local_id(0)%32;
    temp[get_local_id(0)] = val;
    SYNC_WARPS;
    for (int offset = 16; offset > 0; offset /= 2) {
        if (offset < indexInWarp)
            temp[get_local_id(0)] = max(temp[get_local_id(0)], temp[get_local_id(0)+offset]);
        SYNC_WARPS;
    }
    return temp[get_local_id(0)-indexInWarp];
}

31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
/**
 * This function is used on devices that don't support 64 bit atomics.  Multiple threads within
 * a single tile might have computed forces on the same atom.  This loops over them and makes sure
 * that only one thread updates the force on any given atom.
 */
void writeForces(__global real4* forceBuffers,__local AtomData* localData, int atomIndex) {
    localData[get_local_id(0)].x = atomIndex;
    SYNC_WARPS;
    real4 forceSum = (real4) 0;
    int start = (get_local_id(0)/TILE_SIZE)*TILE_SIZE;
    int end = start+32;
    bool isFirst = true;
    for (int i = start; i < end; i++)
        if (localData[i].x == atomIndex) {
            forceSum += (real4) (localData[i].fx, localData[i].fy, localData[i].fz, 0);
            isFirst &= (i >= get_local_id(0));
        }
    const unsigned int warp = get_global_id(0)/TILE_SIZE;
    unsigned int offset = atomIndex + warp*PADDED_NUM_ATOMS;
    if (isFirst)
        forceBuffers[offset] += forceSum;
    SYNC_WARPS;
}

55
__kernel void computeInteractionGroups(
56
57
58
59
60
#ifdef SUPPORTS_64_BIT_ATOMICS
        __global long* restrict forceBuffers,
#else
        __global real4* restrict forceBuffers,
#endif
61
        __global mixed* restrict energyBuffer, __global const real4* restrict posq, __global const int4* restrict groupData,
62
        __global int* restrict numGroupTiles, int useNeighborList,
63
        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
64
65
66
67
68
        PARAMETER_ARGUMENTS) {
    const unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
    const unsigned int warp = get_global_id(0)/TILE_SIZE; // global warpIndex
    const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); // index within the warp
    const unsigned int tbx = get_local_id(0) - tgx;           // block warpIndex
69
    mixed energy = 0;
70
    INIT_DERIVATIVES
71
    __local AtomData localData[LOCAL_MEMORY_SIZE];
72
    __local int reductionBuffer[LOCAL_MEMORY_SIZE];
73

74
75
    const unsigned int startTile = (useNeighborList ? warp*numGroupTiles[0]/totalWarps : FIRST_TILE+warp*(LAST_TILE-FIRST_TILE)/totalWarps);
    const unsigned int endTile = (useNeighborList ? (warp+1)*numGroupTiles[0]/totalWarps : FIRST_TILE+(warp+1)*(LAST_TILE-FIRST_TILE)/totalWarps);
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
    for (int tile = startTile; tile < endTile; tile++) {
        const int4 atomData = groupData[TILE_SIZE*tile+tgx];
        const int atom1 = atomData.x;
        const int atom2 = atomData.y;
        const int rangeStart = atomData.z&0xFFFF;
        const int rangeEnd = (atomData.z>>16)&0xFFFF;
        const int exclusions = atomData.w;
        real4 posq1 = posq[atom1];
        LOAD_ATOM1_PARAMETERS
        real4 force = (real4) (0);
        real4 posq2 = posq[atom2];
        localData[get_local_id(0)].x = posq2.x;
        localData[get_local_id(0)].y = posq2.y;
        localData[get_local_id(0)].z = posq2.z;
        localData[get_local_id(0)].q = posq2.w;
        LOAD_LOCAL_PARAMETERS
        localData[get_local_id(0)].fx = 0.0f;
        localData[get_local_id(0)].fy = 0.0f;
        localData[get_local_id(0)].fz = 0.0f;
        int tj = tgx;
96
        int rangeStop = rangeStart + reduceMax(rangeEnd-rangeStart, reductionBuffer);
97
        SYNC_WARPS;
98
99
        for (int j = rangeStart; j < rangeStop; j++) {
            if (j < rangeEnd) {
peastman's avatar
peastman committed
100
101
102
103
                bool isExcluded = (((exclusions>>tj)&1) == 0);
                int localIndex = tbx+tj;
                posq2 = (real4) (localData[localIndex].x, localData[localIndex].y, localData[localIndex].z, localData[localIndex].q);
                real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
104
#ifdef USE_PERIODIC
105
                APPLY_PERIODIC_TO_DELTA(delta)
106
#endif
peastman's avatar
peastman committed
107
                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
108
#ifdef USE_CUTOFF
peastman's avatar
peastman committed
109
                if (!isExcluded && r2 < CUTOFF_SQUARED) {
110
#endif
peastman's avatar
peastman committed
111
                    real invR = RSQRT(r2);
peastman's avatar
peastman committed
112
                    real r = r2*invR;
peastman's avatar
peastman committed
113
114
115
                    LOAD_ATOM2_PARAMETERS
                    real dEdR = 0.0f;
                    real tempEnergy = 0.0f;
116
                    const real interactionScale = 1.0f;
peastman's avatar
peastman committed
117
118
119
120
121
122
123
                    COMPUTE_INTERACTION
                    energy += tempEnergy;
                    delta *= dEdR;
                    force.xyz -= delta.xyz;
                    localData[localIndex].fx += delta.x;
                    localData[localIndex].fy += delta.y;
                    localData[localIndex].fz += delta.z;
124
#ifdef USE_CUTOFF
peastman's avatar
peastman committed
125
                }
126
#endif
peastman's avatar
peastman committed
127
            }
128
129
130
            tj = (tj == rangeEnd-1 ? rangeStart : tj+1);
            SYNC_WARPS;
        }
131
#ifdef SUPPORTS_64_BIT_ATOMICS
132
133
134
135
136
        if (exclusions != 0) {
            atom_add(&forceBuffers[atom1], (long) (force.x*0x100000000));
            atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
            atom_add(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
        }
peastman's avatar
peastman committed
137
138
139
        atom_add(&forceBuffers[atom2], (long) (localData[get_local_id(0)].fx*0x100000000));
        atom_add(&forceBuffers[atom2+PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fy*0x100000000));
        atom_add(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fz*0x100000000));
140
141
142
143
144
145
146
#else
        writeForces(forceBuffers, localData, atom2);
        localData[get_local_id(0)].fx = force.x;
        localData[get_local_id(0)].fy = force.y;
        localData[get_local_id(0)].fz = force.z;
        writeForces(forceBuffers, localData, atom1);
#endif
147
148
    }
    energyBuffer[get_global_id(0)] += energy;
149
    SAVE_DERIVATIVES
150
}
151
152
153
154
155
156
157
158
159
160
161
162
163
164

/**
 * If the neighbor list needs to be rebuilt, reset the number of tiles to 0.  This is
 * executed by a single thread.
 */
__kernel void prepareToBuildNeighborList(__global int* restrict rebuildNeighborList, __global int* restrict numGroupTiles) {
    if (rebuildNeighborList[0] == 1)
        numGroupTiles[0] = 0;
}

/**
 * Filter the list of tiles to include only ones that have interactions within the
 * padded cutoff.
 */
peastman's avatar
peastman committed
165
__kernel void buildNeighborList(__global int* restrict rebuildNeighborList, __global int* restrict numGroupTiles,
166
167
168
169
170
171
172
173
174
175
176
177
178
179
        __global const real4* restrict posq, __global const int4* restrict groupData, __global int4* restrict filteredGroupData,
        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) {
    
    // If the neighbor list doesn't need to be rebuilt on this step, return immediately.
    
    if (rebuildNeighborList[0] == 0)
        return;

    const unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
    const unsigned int warp = get_global_id(0)/TILE_SIZE; // global warpIndex
    const unsigned int local_warp = get_local_id(0)/TILE_SIZE; // local warpIndex
    const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); // index within the warp
    const unsigned int tbx = get_local_id(0) - tgx;           // block warpIndex
    __local real4 localPos[LOCAL_MEMORY_SIZE];
180
181
    __local volatile bool anyInteraction[WARPS_IN_BLOCK];
    __local volatile int tileIndex[WARPS_IN_BLOCK];
182
    __local int reductionBuffer[LOCAL_MEMORY_SIZE];
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197

    const unsigned int startTile = warp*NUM_TILES/totalWarps;
    const unsigned int endTile = (warp+1)*NUM_TILES/totalWarps;
    for (int tile = startTile; tile < endTile; tile++) {
        const int4 atomData = groupData[TILE_SIZE*tile+tgx];
        const int atom1 = atomData.x;
        const int atom2 = atomData.y;
        const int rangeStart = atomData.z&0xFFFF;
        const int rangeEnd = (atomData.z>>16)&0xFFFF;
        const int exclusions = atomData.w;
        real4 posq1 = posq[atom1];
        localPos[get_local_id(0)] = posq[atom2];
        if (tgx == 0)
            anyInteraction[local_warp] = false;
        int tj = tgx;
198
        int rangeStop = rangeStart + reduceMax(rangeEnd-rangeStart, reductionBuffer);
199
        SYNC_WARPS;
200
        for (int j = rangeStart; j < rangeStop && !anyInteraction[local_warp]; j++) {
peastman's avatar
peastman committed
201
            SYNC_WARPS;
202
            if (j < rangeEnd) {
203
204
205
206
207
208
209
210
211
212
213
214
215
216
                bool isExcluded = (((exclusions>>tj)&1) == 0);
                int localIndex = tbx+tj;
                real4 delta = (real4) (localPos[localIndex].xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
                APPLY_PERIODIC_TO_DELTA(delta)
#endif
                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                if (!isExcluded && r2 < PADDED_CUTOFF_SQUARED)
                    anyInteraction[local_warp] = true;
            }
            tj = (tj == rangeEnd-1 ? rangeStart : tj+1);
            SYNC_WARPS;
        }
        if (anyInteraction[local_warp]) {
peastman's avatar
peastman committed
217
            SYNC_WARPS;
218
            if (tgx == 0)
peastman's avatar
peastman committed
219
                tileIndex[local_warp] = atomic_add(numGroupTiles, 1);
220
221
222
223
224
            SYNC_WARPS;
            filteredGroupData[TILE_SIZE*tileIndex[local_warp]+tgx] = atomData;
        }
    }
}