virtualSites.cl 13 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
/**
 * Load the position of a particle.
 */
mixed4 loadPos(__global const real4* restrict posq, __global const real4* restrict posqCorrection, int index) {
#ifdef USE_MIXED_PRECISION
    real4 pos1 = posq[index];
    real4 pos2 = posqCorrection[index];
    return (mixed4) (pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
#else
    return posq[index];
#endif
}

/**
 * Store the position of a particle.
 */
void storePos(__global real4* restrict posq, __global real4* restrict posqCorrection, int index, mixed4 pos) {
#ifdef USE_MIXED_PRECISION
    posq[index] = (real4) ((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w);
    posqCorrection[index] = (real4) (pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
#else
    posq[index] = pos;
#endif
}

26
27
28
/**
 * Compute the positions of virtual sites
 */
29
30
31
32
33
34
__kernel void computeVirtualSites(__global real4* restrict posq,
#ifdef USE_MIXED_PRECISION
        __global real4* restrict posqCorrection,
#endif
        __global const int4* restrict avg2Atoms, __global const real2* restrict avg2Weights,
        __global const int4* restrict avg3Atoms, __global const real4* restrict avg3Weights,
35
        __global const int4* restrict outOfPlaneAtoms, __global const real4* restrict outOfPlaneWeights,
36
37
38
        __global const int* restrict localCoordsIndex, __global const int* restrict localCoordsAtoms,
        __global const real* restrict localCoordsWeights, __global const real4* restrict localCoordsPos,
        __global const int* restrict localCoordsStartIndex) {
39
40
41
#ifndef USE_MIXED_PRECISION
        __global real4* posqCorrection = 0;
#endif
42
43
44
45
46
    
    // Two particle average sites.
    
    for (int index = get_global_id(0); index < NUM_2_AVERAGE; index += get_global_size(0)) {
        int4 atoms = avg2Atoms[index];
47
48
49
50
        real2 weights = avg2Weights[index];
        mixed4 pos = loadPos(posq, posqCorrection, atoms.x);
        mixed4 pos1 = loadPos(posq, posqCorrection, atoms.y);
        mixed4 pos2 = loadPos(posq, posqCorrection, atoms.z);
51
        pos.xyz = pos1.xyz*weights.x + pos2.xyz*weights.y;
52
        storePos(posq, posqCorrection, atoms.x, pos);
53
54
55
56
57
58
    }
    
    // Three particle average sites.
    
    for (int index = get_global_id(0); index < NUM_3_AVERAGE; index += get_global_size(0)) {
        int4 atoms = avg3Atoms[index];
59
60
61
62
63
        real4 weights = avg3Weights[index];
        mixed4 pos = loadPos(posq, posqCorrection, atoms.x);
        mixed4 pos1 = loadPos(posq, posqCorrection, atoms.y);
        mixed4 pos2 = loadPos(posq, posqCorrection, atoms.z);
        mixed4 pos3 = loadPos(posq, posqCorrection, atoms.w);
64
        pos.xyz = pos1.xyz*weights.x + pos2.xyz*weights.y + pos3.xyz*weights.z;
65
        storePos(posq, posqCorrection, atoms.x, pos);
66
67
68
69
70
71
    }
    
    // Out of plane sites.
    
    for (int index = get_global_id(0); index < NUM_OUT_OF_PLANE; index += get_global_size(0)) {
        int4 atoms = outOfPlaneAtoms[index];
72
73
74
75
76
77
78
        real4 weights = outOfPlaneWeights[index];
        mixed4 pos = loadPos(posq, posqCorrection, atoms.x);
        mixed4 pos1 = loadPos(posq, posqCorrection, atoms.y);
        mixed4 pos2 = loadPos(posq, posqCorrection, atoms.z);
        mixed4 pos3 = loadPos(posq, posqCorrection, atoms.w);
        mixed4 v12 = pos2-pos1;
        mixed4 v13 = pos3-pos1;
79
        pos.xyz = pos1.xyz + v12.xyz*weights.x + v13.xyz*weights.y + cross(v12, v13).xyz*weights.z;
80
        storePos(posq, posqCorrection, atoms.x, pos);
81
    }
82
83
84
85
    
    // Local coordinates sites.
    
    for (int index = get_global_id(0); index < NUM_LOCAL_COORDS; index += get_global_size(0)) {
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
        int siteAtomIndex = localCoordsIndex[index];
        int start = localCoordsStartIndex[index];
        int end = localCoordsStartIndex[index+1];
        mixed3 origin = 0, xdir = 0, ydir = 0;
        for (int j = start; j < end; j++) {
            mixed3 pos = loadPos(posq, posqCorrection, localCoordsAtoms[j]).xyz;
            origin += pos*localCoordsWeights[3*j];
            xdir += pos*localCoordsWeights[3*j+1];
            ydir += pos*localCoordsWeights[3*j+2];
        }
        mixed3 zdir = cross(xdir, ydir);
        mixed normXdir = sqrt(xdir.x*xdir.x+xdir.y*xdir.y+xdir.z*xdir.z);
        mixed normZdir = sqrt(zdir.x*zdir.x+zdir.y*zdir.y+zdir.z*zdir.z);
        mixed invNormXdir = (normXdir > 0 ? 1/normXdir : 0);
        mixed invNormZdir = (normZdir > 0 ? 1/normZdir : 0);
        xdir *= invNormXdir;
        zdir *= invNormZdir;
103
        ydir = cross(zdir, xdir);
104
        mixed3 localPosition = convert_mixed4(localCoordsPos[index]).xyz;
105
        mixed4 pos = loadPos(posq, posqCorrection, siteAtomIndex);
106
107
108
        pos.x = origin.x + xdir.x*localPosition.x + ydir.x*localPosition.y + zdir.x*localPosition.z;
        pos.y = origin.y + xdir.y*localPosition.x + ydir.y*localPosition.y + zdir.y*localPosition.z;
        pos.z = origin.z + xdir.z*localPosition.x + ydir.z*localPosition.y + zdir.z*localPosition.z;
109
        storePos(posq, posqCorrection, siteAtomIndex, pos);
110
    }
111
112
}

113
114
115
116
117
118
119
120
#ifdef HAS_OVERLAPPING_VSITES
    #ifdef SUPPORTS_64_BIT_ATOMICS
        // We will use 64 bit atomics for force redistribution.

        #define ADD_FORCE(index, f) addForce(index, f, longForce);

        #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable

121
        void addForce(int index, real4 f,  __global long* longForce) {
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
            atom_add(&longForce[index], (long) (f.x*0x100000000));
            atom_add(&longForce[index+PADDED_NUM_ATOMS], (long) (f.y*0x100000000));
            atom_add(&longForce[index+2*PADDED_NUM_ATOMS], (long) (f.z*0x100000000));
        }

        __kernel void addDistributedForces(__global const long* restrict longForces, __global real4* restrict forces) {
            real scale = 1/(real) 0x100000000;
            for (int index = get_global_id(0); index < NUM_ATOMS; index += get_global_size(0)) {
                real4 f = (real4) (scale*longForces[index], scale*longForces[index+PADDED_NUM_ATOMS], scale*longForces[index+2*PADDED_NUM_ATOMS], 0);
                forces[index] += f;
            }
        }
    #else
        // 64 bit atomics aren't supported, so we have to use atomic_cmpxchg() for force redistribution.
        
        #define ADD_FORCE(index, f) addForce(index, f, force);

        void atomicAddFloat(__global float* p, float v) {
            __global int* ip = (__global int*) p;
            while (true) {
                union {
                    float f;
                    int i;
                } oldval, newval;
                oldval.f = *p;
                newval.f = oldval.f+v;
                if (atomic_cmpxchg(ip, oldval.i, newval.i) == oldval.i)
                    return;
            }
        }

        void addForce(int index, float4 f, __global float4* force) {
            __global float* components = (__global float*) force;
            atomicAddFloat(&components[4*index], f.x);
            atomicAddFloat(&components[4*index+1], f.y);
            atomicAddFloat(&components[4*index+2], f.z);
        }
    #endif
#else
    // There are no overlapping virtual sites, so we can just store forces directly.

    #define ADD_FORCE(index, f) force[index].xyz += (f).xyz;
#endif

166
167
168
/**
 * Distribute forces from virtual sites to the atoms they are based on.
 */
169
__kernel void distributeForces(__global const real4* restrict posq, __global real4* restrict force,
170
171
172
#ifdef SUPPORTS_64_BIT_ATOMICS
        __global long* restrict longForce,
#endif
173
174
175
#ifdef USE_MIXED_PRECISION
        __global real4* restrict posqCorrection,
#endif
176
177
        __global const int4* restrict avg2Atoms, __global const real2* restrict avg2Weights,
        __global const int4* restrict avg3Atoms, __global const real4* restrict avg3Weights,
178
        __global const int4* restrict outOfPlaneAtoms, __global const real4* restrict outOfPlaneWeights,
179
180
181
        __global const int* restrict localCoordsIndex, __global const int* restrict localCoordsAtoms,
        __global const real* restrict localCoordsWeights, __global const real4* restrict localCoordsPos,
        __global const int* restrict localCoordsStartIndex) {
182
183
184
185
#ifndef USE_MIXED_PRECISION
        __global real4* posqCorrection = 0;
#endif

186
187
188
189
    // Two particle average sites.
    
    for (int index = get_global_id(0); index < NUM_2_AVERAGE; index += get_global_size(0)) {
        int4 atoms = avg2Atoms[index];
190
191
        real2 weights = avg2Weights[index];
        real4 f = force[atoms.x];
192
193
        ADD_FORCE(atoms.y, f*weights.x);
        ADD_FORCE(atoms.z, f*weights.y);
194
195
196
197
198
199
    }
    
    // Three particle average sites.
    
    for (int index = get_global_id(0); index < NUM_3_AVERAGE; index += get_global_size(0)) {
        int4 atoms = avg3Atoms[index];
200
201
        real4 weights = avg3Weights[index];
        real4 f = force[atoms.x];
202
203
204
        ADD_FORCE(atoms.y, f*weights.x);
        ADD_FORCE(atoms.z, f*weights.y);
        ADD_FORCE(atoms.w, f*weights.z);
205
206
207
208
209
210
    }
    
    // Out of plane sites.
    
    for (int index = get_global_id(0); index < NUM_OUT_OF_PLANE; index += get_global_size(0)) {
        int4 atoms = outOfPlaneAtoms[index];
211
212
213
214
215
216
217
218
        real4 weights = outOfPlaneWeights[index];
        mixed4 pos1 = loadPos(posq, posqCorrection, atoms.y);
        mixed4 pos2 = loadPos(posq, posqCorrection, atoms.z);
        mixed4 pos3 = loadPos(posq, posqCorrection, atoms.w);
        mixed4 v12 = pos2-pos1;
        mixed4 v13 = pos3-pos1;
        real4 f = force[atoms.x];
        real4 fp2 = (real4) (weights.x*f.x - weights.z*v13.z*f.y + weights.z*v13.y*f.z,
219
                   weights.z*v13.z*f.x + weights.x*f.y - weights.z*v13.x*f.z,
220
                  -weights.z*v13.y*f.x + weights.z*v13.x*f.y + weights.x*f.z, 0.0f);
221
        real4 fp3 = (real4) (weights.y*f.x + weights.z*v12.z*f.y - weights.z*v12.y*f.z,
222
223
                  -weights.z*v12.z*f.x + weights.y*f.y + weights.z*v12.x*f.z,
                   weights.z*v12.y*f.x - weights.z*v12.x*f.y + weights.y*f.z, 0.0f);
224
225
226
        ADD_FORCE(atoms.y, f-fp2-fp3);
        ADD_FORCE(atoms.z, fp2);
        ADD_FORCE(atoms.w, fp3);
227
    }
228
229
230
231
    
    // Local coordinates sites.
    
    for (int index = get_global_id(0); index < NUM_LOCAL_COORDS; index += get_global_size(0)) {
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
        int siteAtomIndex = localCoordsIndex[index];
        int start = localCoordsStartIndex[index];
        int end = localCoordsStartIndex[index+1];
        mixed3 origin = 0, xdir = 0, ydir = 0;
        for (int j = start; j < end; j++) {
            mixed3 pos = loadPos(posq, posqCorrection, localCoordsAtoms[j]).xyz;
            origin += pos*localCoordsWeights[3*j];
            xdir += pos*localCoordsWeights[3*j+1];
            ydir += pos*localCoordsWeights[3*j+2];
        }
        mixed3 zdir = cross(xdir, ydir);
        mixed normXdir = sqrt(xdir.x*xdir.x+xdir.y*xdir.y+xdir.z*xdir.z);
        mixed normZdir = sqrt(zdir.x*zdir.x+zdir.y*zdir.y+zdir.z*zdir.z);
        mixed invNormXdir = (normXdir > 0 ? 1/normXdir : 0);
        mixed invNormZdir = (normZdir > 0 ? 1/normZdir : 0);
        mixed3 dx = xdir*invNormXdir;
        mixed3 dz = zdir*invNormZdir;
        mixed3 dy = cross(dz, dx);
250
        mixed3 localPosition = convert_mixed4(localCoordsPos[index]).xyz;
251
252
253

        // The derivatives for this case are very complicated.  They were computed with SymPy then simplified by hand.

254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
        real4 f = force[siteAtomIndex];
        mixed3 fp1 = localPosition*f.x;
        mixed3 fp2 = localPosition*f.y;
        mixed3 fp3 = localPosition*f.z;
        for (int j = start; j < end; j++) {
            real originWeight = localCoordsWeights[3*j];
            real wx = localCoordsWeights[3*j+1];
            real wy = localCoordsWeights[3*j+2];
            mixed wxScaled = wx*invNormXdir;
            mixed t1 = (wx*ydir.x-wy*xdir.x)*invNormZdir;
            mixed t2 = (wx*ydir.y-wy*xdir.y)*invNormZdir;
            mixed t3 = (wx*ydir.z-wy*xdir.z)*invNormZdir;
            mixed sx = t3*dz.y-t2*dz.z;
            mixed sy = t1*dz.z-t3*dz.x;
            mixed sz = t2*dz.x-t1*dz.y;
269
            real4 fresult = 0;
270
271
272
273
274
275
276
277
278
279
280
            fresult.x += fp1.x*wxScaled*(1-dx.x*dx.x) + fp1.z*(dz.x*sx   ) + fp1.y*((-dx.x*dy.x     )*wxScaled + dy.x*sx - dx.y*t2 - dx.z*t3) + f.x*originWeight;
            fresult.y += fp1.x*wxScaled*( -dx.x*dx.y) + fp1.z*(dz.x*sy+t3) + fp1.y*((-dx.y*dy.x-dz.z)*wxScaled + dy.x*sy + dx.y*t1);
            fresult.z += fp1.x*wxScaled*( -dx.x*dx.z) + fp1.z*(dz.x*sz-t2) + fp1.y*((-dx.z*dy.x+dz.y)*wxScaled + dy.x*sz + dx.z*t1);
            fresult.x += fp2.x*wxScaled*( -dx.y*dx.x) + fp2.z*(dz.y*sx-t3) - fp2.y*(( dx.x*dy.y-dz.z)*wxScaled - dy.y*sx - dx.x*t2);
            fresult.y += fp2.x*wxScaled*(1-dx.y*dx.y) + fp2.z*(dz.y*sy   ) - fp2.y*(( dx.y*dy.y     )*wxScaled - dy.y*sy + dx.x*t1 + dx.z*t3) + f.y*originWeight;
            fresult.z += fp2.x*wxScaled*( -dx.y*dx.z) + fp2.z*(dz.y*sz+t1) - fp2.y*(( dx.z*dy.y+dz.x)*wxScaled - dy.y*sz - dx.z*t2);
            fresult.x += fp3.x*wxScaled*( -dx.z*dx.x) + fp3.z*(dz.z*sx+t2) + fp3.y*((-dx.x*dy.z-dz.y)*wxScaled + dy.z*sx + dx.x*t3);
            fresult.y += fp3.x*wxScaled*( -dx.z*dx.y) + fp3.z*(dz.z*sy-t1) + fp3.y*((-dx.y*dy.z+dz.x)*wxScaled + dy.z*sy + dx.y*t3);
            fresult.z += fp3.x*wxScaled*(1-dx.z*dx.z) + fp3.z*(dz.z*sz   ) + fp3.y*((-dx.z*dy.z     )*wxScaled + dy.z*sz - dx.x*t1 - dx.y*t2) + f.z*originWeight;
            ADD_FORCE(localCoordsAtoms[j], fresult);
        }
281
    }
282
}