kgbsa1.br 14.8 KB
Newer Older
Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
1

Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
2
3
4
5
6
7
8
9
10
/****************************************************************
* This file is part of the gpu acceleration library for gromacs.
* Author: Mark Friedrichs
* 
* This kernel was developed in collaboration with
* 
* Copyright (C) Pande Group, Stanford, 2006
*****************************************************************/
//96 : 228 Flops
Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
11
12
13
14
15
16
17
18
19
20
kernel void loop1Internal( float3 d1, float3 d2, float3 d3, float3 d4, float4 jBornR,
                           float4 jQ, float iBornR, float iQ, out float4 dGpol_dr<>,
                           out float4 dGpol_dalpha2_ij<> ){
 
   // ---------------------------------------------------------------------------------------

   float4 r2, alpha2_ij, D_ij, expTerm, denominator2, denominator, Gpol;

   // ---------------------------------------------------------------------------------------

Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
21
22
23
24
25
26
27
28
29
30
   r2                 = float4( dot(d1, d1), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) ); //20
   alpha2_ij          = jBornR*iBornR; //4
   D_ij               = r2/(4.0f*alpha2_ij); //8
   expTerm            = exp( -D_ij ); //4 : 80
   denominator2       = r2 + alpha2_ij*expTerm; //8 
   denominator        = sqrt( denominator2 ); //4 : 60
   Gpol               = jQ/denominator; //4
   Gpol              *= iQ;//4
   dGpol_dr           = -Gpol*( 1.0f - 0.25f*expTerm )/denominator2;  //16
   dGpol_dalpha2_ij   = -0.5f*Gpol*expTerm*( 1.0f + D_ij )*jBornR/denominator2; //24
Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62

}

/* ---------------------------------------------------------------------------------------

   Calculate nonpolar ACE term (Simbios) 

   bornRadius:          Born radius
   vdwRadius:           Vdw radius
   duplicationFactor:   duplication factor
   aceForce:            ACE term

   --------------------------------------------------------------------------------------- */

kernel void kAceNonPolarLoop1( float iBornRadius, float iVdwRadius, float duplicationFactor,
                                out float aceForce<> ){

   // ---------------------------------------------------------------------------------------

   // nonpolar term

   float iSurface;
   float iAceTerm;

   // ---------------------------------------------------------------------------------------

   // constants

   // solvent radius

   const float probeRadius       = 0.14f;

Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
63
   // PI*24*0.0f049 (0.0f054=asolv from Tinker)
Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
64
   //const float PI_24_aI          = -0.3694512961;
Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
65
   const float PI_24_aI          = -400.71504079f;
Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109

   // ---------------------------------------------------------------------------------------

   // etch i position and partial charge

   // e = ai * term * (ri+probe)**2 * (ri/rb)**6
   // (drbi) = drb(i) - 6.0fd0*e/rb

   // (rI+probe)**2

   iSurface                     = (iVdwRadius+probeRadius);
   iSurface                     = iSurface*iSurface;

   // (rI/rB)**6

   iAceTerm                     = iVdwRadius/iBornRadius;
   iAceTerm                     = iAceTerm*iAceTerm*iAceTerm;
   iAceTerm                     = iAceTerm*iAceTerm;
   aceForce                     = iSurface*iAceTerm*PI_24_aI/(duplicationFactor*iBornRadius);

}

/* ---------------------------------------------------------------------------------------

   Calculate first loop force terms  (Simbios) 

   numberOfAtoms:       no. of atoms
   roundedUpAtoms:      rounded up number of atoms -- accounts for unrolling
   duplicationFactor:   number of threads for inner loop
   streamWidth:         atom stream width
   fstreamWidth:        force stream width (output -- i-unroll)
   soluteDielectric:    solute dielectric
   solventDielectric:   solvent dielectric
   includeAce:          include ACE term 
   posq:                atom positions and charge
   bornRadii:           Born radii
   nonpolarForce:       nonpolar force (0 if nonpolar not included, else
                        ACE value)
   bornForce1:          i-unroll first force component, including dBornR/dr in .w
   bornForce2:          i-unroll second force component, including dBornR/dr in .w 
   bornForce3:          i-unroll first force component, including dBornR/dr in .w
   bornForce4:          i-unroll second force component, including dBornR/dr in .w 

   --------------------------------------------------------------------------------------- */
Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
110
//FLOPS = 544
Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
111
112
113
kernel void kObcLoop1( float numberOfAtoms, float roundedUpAtoms, float duplicationFactor, 
                       float streamWidth, float fstreamWidth, float soluteDielectric,
                       float solventDielectric, float includeAce,
Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
114
                       float4 posq[][], float  bornRadii[][], float  atomicRadii[][], 
Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
                       out float4 bornForce1<>, out float4 bornForce2<>,
                       out float4 bornForce3<>, out float4 bornForce4<> ){

   // ---------------------------------------------------------------------------------------

   // Born radii

   float i1BornR, i2BornR, i3BornR, i4BornR;
   float j1BornR, j2BornR, j3BornR, j4BornR;
   float4 jBornR;

   // atomic radii

   float i1AtomicR, i2AtomicR, i3AtomicR, i4AtomicR;

   // i,j coordinates

   float3 i1Pos, i2Pos, i3Pos, i4Pos; 
   float3 j1Pos, j2Pos, j3Pos, j4Pos;
   float4 j1PosQ, j2PosQ, j3PosQ, j4PosQ;

   // i, j partial charges

   float i1Q, i2Q, i3Q, i4Q;
   float j1Q, j2Q, j3Q, j4Q;
   float4 jQ;

   float aceForce;
  
   // delta coordinates

   float3 d1, d2, d3, d4;

   // intermediate terms

   float4 dGpol_dr, dGpol_dalpha2_ij;

   // indices

   float2 iAtom; 
   float forceIndex;

   // This is forceIndex mod numberOfAtoms, the true i index

   float iAtomLinearIndex, jLinind; 

   float2 jAtom;
   float jEnd, jStart, jBlock;
   float whichRep;
   float tmp; 

   // ---------------------------------------------------------------------------------------

   // electricConstant           = -166.0f2691;
   // preFactor                  = 2.0f*electricConstant*(1.0f - (1.0f/waterDielectric))
   float preFactor               = -332.05382f;
   const float I_Unroll          = 4.0f;
   const float3 zero3            = float3( 0.0f, 0.0f, 0.0f );

   // ---------------------------------------------------------------------------------------

   preFactor                    *= ( (1.0f/soluteDielectric) - (1.0f/solventDielectric) );

   iAtom                         = indexof( bornForce1 );
   forceIndex                    = I_Unroll*( iAtom.x + iAtom.y*fstreamWidth );

   iAtomLinearIndex              = fmod( forceIndex, roundedUpAtoms );

   // ---------------------------------------------------------------------------------------

   // set gather index

   iAtom.x                       = fmod(  iAtomLinearIndex, streamWidth );
   iAtom.y                       = round( (iAtomLinearIndex - fmod(iAtomLinearIndex, streamWidth ))/streamWidth );

   // ---------------------------------------------------------------------------------------

   // etch i1 position and partial charge

   jQ                            = posq[          iAtom ];
Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
195
196
   i1BornR                       = bornRadii[     iAtom ];
   i1AtomicR                     = atomicRadii[   iAtom ];
Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
197
   i1Pos                         = jQ.xyz;
Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
198
   i1Q                           = jQ.w;
Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
199
200
201
202
203
204
205
206
207
208
209
   i1Q                          *= preFactor;
   kAceNonPolarLoop1( i1BornR, i1AtomicR, duplicationFactor, aceForce );
   bornForce1.xyz                = zero3;
   bornForce1.w                  = includeAce > 0.5f ? aceForce : 0.0f;

   // ---------------------------------------------------------------------------------------

   // etch i2 position and partial charge

   iAtom.x                      += 1;
   jQ                            = posq[          iAtom ];
Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
210
211
   i2BornR                       = bornRadii[     iAtom ];
   i2AtomicR                     = atomicRadii[   iAtom ];
Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
212
   i2Pos                         = jQ.xyz;
Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
213
   i2Q                           = jQ.w;
Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
214
215
216
217
218
219
220
221
222
223
224
   i2Q                          *= preFactor;
   kAceNonPolarLoop1( i2BornR, i2AtomicR, duplicationFactor, aceForce );
   bornForce2.xyz                = zero3;
   bornForce2.w                  = includeAce > 0.5f ? aceForce : 0.0f;

   // ---------------------------------------------------------------------------------------

   // etch i3 position and partial charge

   iAtom.x                      += 1;
   jQ                            = posq[          iAtom ];
Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
225
226
   i3BornR                       = bornRadii[     iAtom ];
   i3AtomicR                     = atomicRadii[   iAtom ];
Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
227
   i3Pos                         = jQ.xyz;
Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
228
   i3Q                           = jQ.w;
Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
229
230
231
232
233
234
235
236
237
238
239
240
   i3Q                          *= preFactor;
   kAceNonPolarLoop1( i3BornR, i3AtomicR, duplicationFactor, aceForce );
   bornForce3.xyz                = zero3;
   bornForce3.w                  = includeAce > 0.5f ? aceForce : 0.0f;

   // ---------------------------------------------------------------------------------------

   // etch i4 position and partial charge

   iAtom.x                      += 1;

   jQ                            = posq[          iAtom ];
Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
241
242
   i4BornR                       = bornRadii[     iAtom ];
   i4AtomicR                     = atomicRadii[   iAtom ];
Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
243
   i4Pos                         = jQ.xyz;
Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
244
   i4Q                           = jQ.w;
Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
   i4Q                          *= preFactor;
   kAceNonPolarLoop1( i4BornR, i4AtomicR, duplicationFactor, aceForce );
   bornForce4.xyz                = zero3;
   bornForce4.w                  = includeAce > 0.5f ? aceForce : 0.0f;

   // ---------------------------------------------------------------------------------------

   // inner loop setup

   // if dupFac == 4, I_UnRoll =2, then breaking inner loop into two segments
   // to increase number of threads in flight

   // forceStreamSz = N*RepFac/I_UnRoll
   // forceIndex       = I_UnRoll*( a.x + a.y*forceStreamSz )
   // whichRep      = 0 or 1
   // jBlock        = 1 + floor[ N/(duplicationFactor*streamWidth) ]

   //changed the following instruction for rounding issues on some ASICs
   //whichRep                     = floor( forceIndex / roundedUpAtoms );
   
   tmp = fmod(forceIndex, roundedUpAtoms);
   whichRep = round((forceIndex - tmp)/roundedUpAtoms);
Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
267
268
269
270

   //jBlock                        = 1 + floor( numberOfAtoms/(duplicationFactor*streamWidth ) );
   tmp = fmod( numberOfAtoms, (duplicationFactor*streamWidth ) );   
   jBlock                       = 1 + round( (numberOfAtoms-tmp)/(duplicationFactor*streamWidth ));
Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
   
   jStart                       = whichRep*jBlock;

   jEnd                         = ( whichRep > duplicationFactor - 1.5f ) ? 999999.0f : (jStart + jBlock);

   jAtom.y                      = jStart;
   jLinind                      = jAtom.y*streamWidth;

   // ---------------------------------------------------------------------------------------

   while ( jAtom.y < jEnd && ( numberOfAtoms - jLinind )  > 0.9f ){
      jAtom.x = 0.0f;
      while ( jAtom.x < streamWidth && ( numberOfAtoms - jLinind ) > 0.9f ) {

         // ---------------------------------------------------------------------------------------

         // gather required values

Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
289
290
291
         j1PosQ             = posq[      jAtom ];
         j1Pos              = j1PosQ.xyz;
         j1Q                = j1PosQ.w;
Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
292
293
294
         j1BornR            = bornRadii[ jAtom ];
         jAtom.x           += 1.0f;

Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
295
296
297
         j2PosQ             = posq[      jAtom ];
         j2Pos              = j2PosQ.xyz;
         j2Q                = j2PosQ.w;
Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
298
299
300
         j2BornR            = bornRadii[ jAtom ];
         jAtom.x           += 1.0f;

Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
301
302
303
         j3PosQ             = posq[      jAtom ];
         j3Pos              = j3PosQ.xyz;
         j3Q                = j3PosQ.w;
Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
304
305
306
         j3BornR            = bornRadii[ jAtom ];
         jAtom.x           += 1.0f;

Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
307
308
309
         j4PosQ             = posq[      jAtom ];
         j4Pos              = j4PosQ.xyz;
         j4Q                = j4PosQ.w;
Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
310
311
312
313
314
315
316
317
318
319
         j4BornR            = bornRadii[ jAtom ];
         jAtom.x           += 1.0f;

         jBornR             = float4( j1BornR, j2BornR, j3BornR, j4BornR );
         jQ                 = float4( j1Q, j2Q, j3Q, j4Q );

         // ---------------------------------------------------------------------------------------

         // i == 1

Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
320
321
322
323
         d1                 = i1Pos - j1Pos; //3
         d2                 = i1Pos - j2Pos; //3
         d3                 = i1Pos - j3Pos; //3
         d4                 = i1Pos - j4Pos; //3
Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
324

Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
325
         loop1Internal( d1, d2, d3, d4, jBornR, jQ, i1BornR, i1Q, dGpol_dr, dGpol_dalpha2_ij ); //96 : 228
Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
326
 
Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
327
328
329
330
         bornForce1.xyz    += dGpol_dr.x*d1; //6
         bornForce1.xyz    += dGpol_dr.y*d2; //6
         bornForce1.xyz    += dGpol_dr.z*d3; //6
         bornForce1.xyz    += dGpol_dr.w*d4; //6
Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
331

Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
332
         bornForce1.w      += dGpol_dalpha2_ij.x + dGpol_dalpha2_ij.y + dGpol_dalpha2_ij.z + dGpol_dalpha2_ij.w; //4
Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
333
334
335
336
337

         // ---------------------------------------------------------------------------------------

         // i == 2

Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
338
339
340
341
         d1                 = i2Pos - j1Pos; //3
         d2                 = i2Pos - j2Pos; //3
         d3                 = i2Pos - j3Pos; //3
         d4                 = i2Pos - j4Pos; //3
Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
342

Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
343
         loop1Internal( d1, d2, d3, d4, jBornR, jQ, i2BornR, i2Q, dGpol_dr, dGpol_dalpha2_ij ); //96 : 228
Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
344

Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
345
346
347
348
         bornForce2.xyz    += dGpol_dr.x*d1; //6
         bornForce2.xyz    += dGpol_dr.y*d2; //6
         bornForce2.xyz    += dGpol_dr.z*d3; //6
         bornForce2.xyz    += dGpol_dr.w*d4; //6
Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
349

Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
350
         bornForce2.w      += dGpol_dalpha2_ij.x + dGpol_dalpha2_ij.y + dGpol_dalpha2_ij.z + dGpol_dalpha2_ij.w; //4
Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
351
352
353
354
355

         // ---------------------------------------------------------------------------------------

         // i == 3

Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
356
357
358
359
         d1                 = i3Pos - j1Pos; //3
         d2                 = i3Pos - j2Pos; //3
         d3                 = i3Pos - j3Pos; //3
         d4                 = i3Pos - j4Pos; //3
Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
360

Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
361
         loop1Internal( d1, d2, d3, d4, jBornR, jQ, i3BornR, i3Q, dGpol_dr, dGpol_dalpha2_ij ); //96 : 228
Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
362

Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
363
364
365
366
         bornForce3.xyz    += dGpol_dr.x*d1; //6
         bornForce3.xyz    += dGpol_dr.y*d2; //6
         bornForce3.xyz    += dGpol_dr.z*d3; //6
         bornForce3.xyz    += dGpol_dr.w*d4; //6 
Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
367

Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
368
         bornForce3.w      += dGpol_dalpha2_ij.x + dGpol_dalpha2_ij.y + dGpol_dalpha2_ij.z + dGpol_dalpha2_ij.w; //4
Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
369
370
371
372
373

         // ---------------------------------------------------------------------------------------

         // i == 4

Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
374
375
376
377
         d1                 = i4Pos - j1Pos; //3
         d2                 = i4Pos - j2Pos; //3
         d3                 = i4Pos - j3Pos; //3
         d4                 = i4Pos - j4Pos; //3
Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
378

Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
379
         loop1Internal( d1, d2, d3, d4, jBornR, jQ, i4BornR, i4Q, dGpol_dr, dGpol_dalpha2_ij ); //96 : 228
Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
380

Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
381
382
383
384
         bornForce4.xyz    += dGpol_dr.x*d1; //6 
         bornForce4.xyz    += dGpol_dr.y*d2; //6 
         bornForce4.xyz    += dGpol_dr.z*d3; //6 
         bornForce4.xyz    += dGpol_dr.w*d4; //6 
Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
385

Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
386
         bornForce4.w      += dGpol_dalpha2_ij.x + dGpol_dalpha2_ij.y + dGpol_dalpha2_ij.z + dGpol_dalpha2_ij.w; //4
Mark Friedrichs's avatar
Mods  
Mark Friedrichs committed
387
388
389
390
391
392
393
394
395

         // ---------------------------------------------------------------------------------------

         jLinind    += 4.0f;
      }
      jAtom.y       += 1.0f;      
   }

}