kCalculateAmoebaCudaElectrostatic.cu 42.8 KB
Newer Older
Mark Friedrichs's avatar
Mark Friedrichs committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
//-----------------------------------------------------------------------------------------

//-----------------------------------------------------------------------------------------

#include "amoebaGpuTypes.h"
#include "amoebaCudaKernels.h"
#include "kCalculateAmoebaCudaUtilities.h"

//#define AMOEBA_DEBUG

static __constant__ cudaGmxSimulation cSim;
static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;

// Copies the host-side simulation parameter structs (gpuContext->sim and
// amoebaSim) into this file's __constant__ copies, cSim and cAmoebaSim.
// Each copy is checked with RTERROR.
void SetCalculateAmoebaElectrostaticSim(amoebaGpuContext amoebaGpu)
{
    // base simulation parameters -> cSim
    cudaError_t status = cudaMemcpyToSymbol(cSim, &amoebaGpu->gpuContext->sim, sizeof(cudaGmxSimulation));
    RTERROR(status, "SetCalculateAmoebaElectrostaticSim: cudaMemcpyToSymbol: SetSim copy to cSim failed");

    // Amoeba-specific parameters -> cAmoebaSim
    status = cudaMemcpyToSymbol(cAmoebaSim, &amoebaGpu->amoebaSim, sizeof(cudaAmoebaGmxSimulation));
    RTERROR(status, "SetCalculateAmoebaElectrostaticSim: cudaMemcpyToSymbol: SetSim copy to cAmoebaSim failed");
}

// Reads this file's __constant__ copies (cSim, cAmoebaSim) back into the
// host-side structs gpuContext->sim and amoebaSim.
// Each copy is checked with RTERROR.
void GetCalculateAmoebaElectrostaticSim(amoebaGpuContext amoebaGpu)
{
    // cSim -> base simulation parameters
    cudaError_t status = cudaMemcpyFromSymbol(&amoebaGpu->gpuContext->sim, cSim, sizeof(cudaGmxSimulation));
    RTERROR(status, "GetCalculateAmoebaElectrostaticSim: cudaMemcpyFromSymbol: SetSim copy from cSim failed");

    // cAmoebaSim -> Amoeba-specific parameters
    status = cudaMemcpyFromSymbol(&amoebaGpu->amoebaSim, cAmoebaSim, sizeof(cudaAmoebaGmxSimulation));
    RTERROR(status, "GetCalculateAmoebaElectrostaticSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed");
}

// Slot numbers into the per-pair scalingFactors array used by
// calculateElectrostaticPairIxn_kernel.

// slots 0-3: P, D, U and M scale factors read by the pair routine
static const int PScaleIndex = 0;
static const int DScaleIndex = 1;
static const int UScaleIndex = 2;
static const int MScaleIndex = 3;

// slots 4-7: damping scale factors the pair routine overwrites when both
// particles have nonzero damp and are within the scaling distance cutoff
static const int Scale3Index = 4;
static const int Scale5Index = 5;
static const int Scale7Index = 6;
static const int Scale9Index = 7;

// slots 8-16: three 3-component vectors; each constant names the first
// component and the following two slots hold the remaining components
// (the pair routine indexes them as base + 0..2)
static const int Ddsc30Index = 8;     // slots  9, 10 follow (formerly Ddsc31Index, Ddsc32Index)
static const int Ddsc50Index = 11;    // slots 12, 13 follow (formerly Ddsc51Index, Ddsc52Index)
static const int Ddsc70Index = 14;    // slots 15, 16 follow (formerly Ddsc71Index, Ddsc72Index)
51
52
53
54
55
56
57
// One past the highest scalingFactors slot (the Ddsc70Index vector ends at
// slot 16); presumably the required array length -- confirm against callers.
static const int LastScalingIndex = 17;

// Dot product of the first three elements of u and v.
// Arguments are parenthesized so pointer expressions such as (p + 3) expand
// correctly.
#define DOT3_4(u,v) (((u)[0])*((v)[0]) + ((u)[1])*((v)[1]) + ((u)[2])*((v)[2]))

// Sum of the element-wise products of two 9-element arrays u and v.
// The whole expansion is parenthesized so the macro can be used safely inside
// a larger expression (e.g. 2.0f*MATRIXDOT31(a,b)); the summation order is
// unchanged.
#define MATRIXDOT31(u,v) ((u)[0]*(v)[0] + (u)[1]*(v)[1] + (u)[2]*(v)[2] + \
  (u)[3]*(v)[3] + (u)[4]*(v)[4] + (u)[5]*(v)[5] + \
  (u)[6]*(v)[6] + (u)[7]*(v)[7] + (u)[8]*(v)[8])
Mark Friedrichs's avatar
Mark Friedrichs committed
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94

// Dot product of the first three elements of u and v.
// Arguments are parenthesized so pointer expressions such as (m + 3) expand
// correctly.
#define DOT31(u,v) (((u)[0])*((v)[0]) + ((u)[1])*((v)[1]) + ((u)[2])*((v)[2]))

// NOTE: despite the name, this value is numerically 9/35 (0.257142857...),
// not 1/35; it multiplies (2*damp^2 + damp^3) in the Scale9 damping factor.
#define i35 0.257142857f
// single-precision 1, used in the damping expressions
#define one 1.0f

// Cross product of two 3-vectors: vectorZ = vectorX x vectorY.
// Components are written in the order 0, 1, 2, and the operands are re-read
// through the pointers for every component.
__device__ void acrossProductVector3(   float* vectorX, float* vectorY, float* vectorZ ){
    #pragma unroll
    for( int axis = 0; axis < 3; axis++ ){
        // cyclic successors of axis: (0 -> 1,2), (1 -> 2,0), (2 -> 0,1)
        const int next  = (axis + 1) % 3;
        const int after = (axis + 2) % 3;
        vectorZ[axis]   = vectorX[next]*vectorY[after] - vectorX[after]*vectorY[next];
    }
}

// Multiplies a 9-element matrix by a 3-vector:
//     vectorZ[row] = matrixX[row]*vectorY[0] + matrixX[row+3]*vectorY[1] + matrixX[row+6]*vectorY[2]
// i.e. element (row, col) of the matrix is read from matrixX[row + 3*col].
__device__ void amatrixProductVector3(   float* matrixX, float* vectorY, float* vectorZ ){
    #pragma unroll
    for( int row = 0; row < 3; row++ ){
        vectorZ[row] = matrixX[row]*vectorY[0] + matrixX[row + 3]*vectorY[1] + matrixX[row + 6]*vectorY[2];
    }
}

// Treats each 9-element matrix as three consecutive 3-element groups
// (elements 0-2, 3-5, 6-8) and combines them in cross-product fashion:
//     vectorZ[0] = X1.Y2 - X2.Y1
//     vectorZ[1] = X2.Y0 - X0.Y2
//     vectorZ[2] = X0.Y1 - X1.Y0
// where Xk / Yk is the k-th group of matrixX / matrixY and "." is DOT31.
__device__ void amatrixCrossProductMatrix3( float* matrixX, float* matrixY, float* vectorZ ){

    // the three groups of matrixX
    float* const x0 = matrixX;
    float* const x1 = matrixX + 3;
    float* const x2 = matrixX + 6;

    // the three groups of matrixY
    float* const y0 = matrixY;
    float* const y1 = matrixY + 3;
    float* const y2 = matrixY + 6;

    vectorZ[0] = DOT31( x1, y2 ) - DOT31( x2, y1 );
    vectorZ[1] = DOT31( x2, y0 ) - DOT31( x0, y2 );
    vectorZ[2] = DOT31( x0, y1 ) - DOT31( x1, y0 );

}

95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
// Per-particle data consumed by calculateElectrostaticPairIxn_kernel.
// Every member is a float or float array (31 floats in total); the member
// order is part of the layout and must not change.
struct ElectrostaticParticle {

    // position (x, y, z) and charge (q)
    float x, y, z;
    float q;

    // permanent multipoles in the lab frame: dipole (3) and quadrupole (9 elements)
    float labFrameDipole[3];
    float labFrameQuadrupole[9];

    // induced dipole and polar induced dipole
    float inducedDipole[3];
    float inducedDipoleP[3];

    // damping parameters: the pair routine uses the smaller thole of the two
    // particles and the product of the two damp values
    float thole, damp;

    // presumably per-particle force / torque accumulators; not touched by the
    // pair routine itself -- confirm against the calling kernel
    float force[3];
    float torque[3];

    // unused by the pair routine; presumably present only for size/alignment
    float padding;

};

__device__ void calculateElectrostaticPairIxn_kernel( ElectrostaticParticle& atomI,   ElectrostaticParticle& atomJ,
133
                                                      float* scalingFactors, float4*  outputForce, float4  outputTorque[2]
Mark Friedrichs's avatar
Mark Friedrichs committed
134
135
136
137
138
#ifdef AMOEBA_DEBUG
                                                      ,float4* debugArray 
#endif
 ){
  
139
    float deltaR[3];
Mark Friedrichs's avatar
Mark Friedrichs committed
140
141
142
143
144
145
146
147
148
    
    // ---------------------------------------------------------------------------------------
    
    // ---------------------------------------------------------------------------------------

    float* ddsc3                    =  scalingFactors + Ddsc30Index;
    float* ddsc5                    =  scalingFactors + Ddsc50Index;
    float* ddsc7                    =  scalingFactors + Ddsc70Index;

149
150
151
    deltaR[0]                       = atomJ.x - atomI.x;
    deltaR[1]                       = atomJ.y - atomI.y;
    deltaR[2]                       = atomJ.z - atomI.z;
Mark Friedrichs's avatar
Mark Friedrichs committed
152
153
154
155
156
157
158
159
160
161
162
163
164

    float r2                        = DOT31( deltaR, deltaR );
    float r                         = sqrtf( r2 );
    float rr1                       = 1.0f/r;
    float rr2                       = rr1*rr1;
    float rr3                       = rr1*rr2;
    float rr5                       = 3.0f*rr3*rr2;
    float rr7                       = 5.0f*rr5*rr2;
    float rr9                       = 7.0f*rr7*rr2;
    float rr11                      = 9.0f*rr9*rr2;

    //-------------------------------------------

165
    if( atomI.damp != 0.0f && atomJ.damp != 0.0 && r < cAmoebaSim.scalingDistanceCutoff ){
Mark Friedrichs's avatar
Mark Friedrichs committed
166
167
168
169
170
   
        float distanceIJ, r2I;
        distanceIJ                    = r;
        r2I                           = rr2;
        
171
172
        float ratio                   = distanceIJ/(atomI.damp*atomJ.damp);
        float pGamma                  = atomJ.thole > atomI.thole ? atomI.thole : atomJ.thole;
Mark Friedrichs's avatar
Mark Friedrichs committed
173

174
        float damp                          = ratio*ratio*ratio*pGamma;
Mark Friedrichs's avatar
Mark Friedrichs committed
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
        float dampExp                 = expf( -damp );
        float damp1                   = damp + one;
        float damp2                   = damp*damp;
        float damp3                   = damp2*damp;

        scalingFactors[Scale3Index]   = one - dampExp;
        scalingFactors[Scale5Index]   = one - damp1*dampExp;
        scalingFactors[Scale7Index]   = one - ( damp1 + 0.6f*damp2)*dampExp;
        scalingFactors[Scale9Index]   = one - ( damp1 + ( 2.0f*damp2 + damp3 )*i35)*dampExp;

        float factor                  = 3.0f*damp*dampExp*r2I;
        float factor7                 = -0.2f + 0.6f*damp;
        
        for( int ii = 0; ii < 3; ii++ ){
            scalingFactors[Ddsc30Index + ii] = factor*deltaR[ii];
            scalingFactors[Ddsc50Index + ii] = scalingFactors[Ddsc30Index + ii]*damp;
            scalingFactors[Ddsc70Index + ii] = scalingFactors[Ddsc50Index + ii]*factor7;
        }

    }
      
Peter Eastman's avatar
Peter Eastman committed
196
197
198
199
200
201
202
203
    float scaleI0 = scalingFactors[Scale3Index]*scalingFactors[UScaleIndex];
    float dsc0    = scalingFactors[Scale3Index]*scalingFactors[DScaleIndex];
    float psc0    = scalingFactors[Scale3Index]*scalingFactors[PScaleIndex];
    float scaleI1 = scalingFactors[Scale3Index+1]*scalingFactors[UScaleIndex];
    float dsc1    = scalingFactors[Scale3Index+1]*scalingFactors[DScaleIndex];
    float psc1    = scalingFactors[Scale3Index+1]*scalingFactors[PScaleIndex];
    float dsc2    = scalingFactors[Scale3Index+2]*scalingFactors[DScaleIndex];
    float psc2    = scalingFactors[Scale3Index+2]*scalingFactors[PScaleIndex];
Mark Friedrichs's avatar
Mark Friedrichs committed
204
                       
205
    float qIr[3], qJr[3];
Mark Friedrichs's avatar
Mark Friedrichs committed
206

207
208
    amatrixProductVector3( atomJ.labFrameQuadrupole,      deltaR,      qJr);
    amatrixProductVector3( atomI.labFrameQuadrupole,      deltaR,      qIr);
Mark Friedrichs's avatar
Mark Friedrichs committed
209

Peter Eastman's avatar
Peter Eastman committed
210
211
212
    float sc2     = DOT3_4(        atomI.labFrameDipole,  atomJ.labFrameDipole );
    float sc3     = DOT3_4(        atomI.labFrameDipole,  deltaR  );
    float sc4     = DOT3_4(        atomJ.labFrameDipole,  deltaR  );
Mark Friedrichs's avatar
Mark Friedrichs committed
213
    
Peter Eastman's avatar
Peter Eastman committed
214
215
    float sc5     = DOT3_4(        qIr, deltaR  );
    float sc6     = DOT3_4(        qJr, deltaR  );
Mark Friedrichs's avatar
Mark Friedrichs committed
216
    
Peter Eastman's avatar
Peter Eastman committed
217
218
    float sc7     = DOT3_4(        qIr, atomJ.labFrameDipole );
    float sc8     = DOT3_4(        qJr, atomI.labFrameDipole );
Mark Friedrichs's avatar
Mark Friedrichs committed
219
    
Peter Eastman's avatar
Peter Eastman committed
220
    float sc9     = DOT3_4(        qIr, qJr );
Mark Friedrichs's avatar
Mark Friedrichs committed
221
    
Peter Eastman's avatar
Peter Eastman committed
222
    float sc10    = MATRIXDOT31( atomI.labFrameQuadrupole, atomJ.labFrameQuadrupole );
Mark Friedrichs's avatar
Mark Friedrichs committed
223
    
Peter Eastman's avatar
Peter Eastman committed
224
225
226
227
228
    float sci1    = DOT3_4(        atomI.inducedDipole,  atomJ.labFrameDipole ) +
                    DOT3_4(        atomJ.inducedDipole,  atomI.labFrameDipole );
        
    float sci3    = DOT3_4(        atomI.inducedDipole,  deltaR  );
    float sci4    = DOT3_4(        atomJ.inducedDipole,  deltaR  );
Mark Friedrichs's avatar
Mark Friedrichs committed
229
    
Peter Eastman's avatar
Peter Eastman committed
230
231
    float sci7    = DOT3_4(        qIr, atomJ.inducedDipole );
    float sci8    = DOT3_4(        qJr, atomI.inducedDipole );
Mark Friedrichs's avatar
Mark Friedrichs committed
232
    
Peter Eastman's avatar
Peter Eastman committed
233
234
    float scip1   = DOT3_4(        atomI.inducedDipoleP, atomJ.labFrameDipole ) +
                    DOT3_4(        atomJ.inducedDipoleP, atomI.labFrameDipole );
Mark Friedrichs's avatar
Mark Friedrichs committed
235
    
Peter Eastman's avatar
Peter Eastman committed
236
237
    float scip2   = DOT3_4(        atomI.inducedDipole,  atomJ.inducedDipoleP) +
                    DOT3_4(        atomJ.inducedDipole,  atomI.inducedDipoleP);
Mark Friedrichs's avatar
Mark Friedrichs committed
238
    
Peter Eastman's avatar
Peter Eastman committed
239
240
    float scip3   = DOT3_4(        atomI.inducedDipoleP, deltaR );
    float scip4   = DOT3_4(        atomJ.inducedDipoleP, deltaR );
Mark Friedrichs's avatar
Mark Friedrichs committed
241
    
Peter Eastman's avatar
Peter Eastman committed
242
243
    float scip7   = DOT3_4(        qIr, atomJ.inducedDipoleP );
    float scip8   = DOT3_4(        qJr, atomI.inducedDipoleP );
Mark Friedrichs's avatar
Mark Friedrichs committed
244

245
246
247
248
249
250
    float scaleF             = 0.5f*scalingFactors[UScaleIndex];
    float inducedFactor3     = scip2*rr3*scaleF;
    float inducedFactor5     = (sci3*scip4+scip3*sci4)*rr5*scaleF;
    float findmp_0           = inducedFactor3*ddsc3[0] - inducedFactor5*ddsc5[0];
    float findmp_1           = inducedFactor3*ddsc3[1] - inducedFactor5*ddsc5[1];
    float findmp_2           = inducedFactor3*ddsc3[2] - inducedFactor5*ddsc5[2];
Mark Friedrichs's avatar
Mark Friedrichs committed
251

Peter Eastman's avatar
Peter Eastman committed
252
253
254
255
256
    float gli1               = atomJ.q*sci3 - atomI.q*sci4;
    float gli2               = -sc3*sci4 - sci3*sc4;
    float gli3               = sci3*sc6 - sci4*sc5;
    float gli6               = sci1;
    float gli7               = 2.0f*(sci7-sci8);
Mark Friedrichs's avatar
Mark Friedrichs committed
257
    
Peter Eastman's avatar
Peter Eastman committed
258
259
260
261
262
    float glip1              = atomJ.q*scip3 - atomI.q*scip4;
    float glip2              = -sc3*scip4 - scip3*sc4;
    float glip3              = scip3*sc6 - scip4*sc5;
    float glip6              = scip1;
    float glip7              = 2.0f*(scip7-scip8);
Mark Friedrichs's avatar
Mark Friedrichs committed
263
    
264
265
266
    float factor3            = rr3*(( gli1  +  gli6)*scalingFactors[PScaleIndex] + (glip1  + glip6)*scalingFactors[DScaleIndex]);
    float factor5            = rr5*(( gli2  +  gli7)*scalingFactors[PScaleIndex] + (glip2  + glip7)*scalingFactors[DScaleIndex]);
    float factor7            = rr7*( gli3*scalingFactors[PScaleIndex] + glip3*scalingFactors[DScaleIndex]);
Mark Friedrichs's avatar
Mark Friedrichs committed
267
      
268
269
270
    float fridmp_0           = 0.5f*(factor3*ddsc3[0] + factor5*ddsc5[0] + factor7*ddsc7[0]);
    float fridmp_1           = 0.5f*(factor3*ddsc3[1] + factor5*ddsc5[1] + factor7*ddsc7[1]);
    float fridmp_2           = 0.5f*(factor3*ddsc3[2] + factor5*ddsc5[2] + factor7*ddsc7[2]);
Mark Friedrichs's avatar
Mark Friedrichs committed
271
      
Peter Eastman's avatar
Peter Eastman committed
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
    float gl0 = atomI.q*atomJ.q;
    float gl1 = atomJ.q*sc3 - atomI.q*sc4;
    float gl2 = atomI.q*sc6 + atomJ.q*sc5 - sc3*sc4;
    float gl3 = sc3*sc6 - sc4*sc5;
    float gl4 = sc5*sc6;
    float gl6 = sc2;
    float gl7 = 2.0f*(sc7-sc8);
    float gl8 = 2.0f*sc10;
    float gl5 = -4.0f*sc9;
    
    float gf1 = rr3*gl0 + rr5*(gl1+gl6) + rr7*(gl2+gl7+gl8) + rr9*(gl3+gl5) + rr11*gl4;
    float gf2 = -atomJ.q*rr3 + sc4*rr5 - sc6*rr7;
    float gf3 =  atomI.q*rr3 + sc3*rr5 + sc5*rr7;
    float gf4 = 2.0f*rr5;
    float gf5 = 2.0f*(-atomJ.q*rr5+sc4*rr7-sc6*rr9);
    float gf6 = 2.0f*(-atomI.q*rr5-sc3*rr7-sc5*rr9);
    float gf7 = 4.0f*rr7;
Mark Friedrichs's avatar
Mark Friedrichs committed
289
290
291

    // energy

Peter Eastman's avatar
Peter Eastman committed
292
293
    float em                 = scalingFactors[MScaleIndex]*(rr1*gl0 + rr3*(gl1+gl6) + rr5*(gl2+gl7+gl8) + rr7*(gl3+gl5) + rr9*gl4);
    float ei                 = 0.5f*(rr3*(gli1+gli6)*psc0 + rr5*(gli2+gli7)*psc1 + rr7*gli3*psc2);
294
    outputForce->w           = em+ei;
Mark Friedrichs's avatar
Mark Friedrichs committed
295
296
297
298
299
    
#ifdef AMOEBA_DEBUG
#if 0
if( 1 ){
    int debugIndex           = 0;
300
301
    debugArray[debugIndex].x = em;
    debugArray[debugIndex].y = ei;
Mark Friedrichs's avatar
Mark Friedrichs committed
302
303
304
305
    debugArray[debugIndex].z = rr1;
    debugArray[debugIndex].w = rr3;

    debugIndex++;
Peter Eastman's avatar
Peter Eastman committed
306
307
308
309
    debugArray[debugIndex].x = gl0;
    debugArray[debugIndex].y = gl1;
    debugArray[debugIndex].z = gl6;
    debugArray[debugIndex].w = gl2;
Mark Friedrichs's avatar
Mark Friedrichs committed
310
311

    debugIndex++;
Peter Eastman's avatar
Peter Eastman committed
312
313
314
315
    debugArray[debugIndex].x = gli1;
    debugArray[debugIndex].y = gli3;
    debugArray[debugIndex].z = gli2;
    debugArray[debugIndex].w = gli7;
Mark Friedrichs's avatar
Mark Friedrichs committed
316
317

    debugIndex++;
Peter Eastman's avatar
Peter Eastman committed
318
319
320
    debugArray[debugIndex].x = psc0;
    debugArray[debugIndex].y = psc1;
    debugArray[debugIndex].z = psc2;
Mark Friedrichs's avatar
Mark Friedrichs committed
321
322
323
324
325
326
327
    debugArray[debugIndex].w = scalingFactors[MScaleIndex];

}
#endif
#endif

    float temp1[3],temp2[3],temp3[3];
328
329
330
    float qIqJr[3], qJqIr[3], qIdJ[3], qJdI[3];
    amatrixProductVector3( atomI.labFrameQuadrupole,      atomJ.labFrameDipole,     qIdJ );//MK
    amatrixProductVector3( atomJ.labFrameQuadrupole,      atomI.labFrameDipole,     qJdI );//MK
Mark Friedrichs's avatar
Mark Friedrichs committed
331

332
333
334
335
    amatrixProductVector3( atomI.labFrameQuadrupole,      qJr,    qIqJr );//MK
    amatrixProductVector3( atomJ.labFrameQuadrupole,      qIr,    qJqIr );//MK
    amatrixProductVector3( atomJ.labFrameQuadrupole,      qIr,    temp1 );
    amatrixProductVector3( atomJ.labFrameQuadrupole,      atomI.labFrameDipole,     temp2 );
Mark Friedrichs's avatar
Mark Friedrichs committed
336

337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
    float ftm2_0 = gf1*deltaR[0] +
                     gf2*atomI.labFrameDipole[0] + gf3*atomJ.labFrameDipole[0]  +
                     gf4*(temp2[0]  - qIdJ[0])   +
                     gf5*qIr[0]    + gf6*qJr[0]  +
                     gf7*(qIqJr[0] + temp1[0]);
    
    float ftm2_1 = gf1*deltaR[1]                 +
                     gf2*atomI.labFrameDipole[1] + gf3*atomJ.labFrameDipole[1]  +
                     gf4*(temp2[1]  - qIdJ[1])   +
                     gf5*qIr[1]    + gf6*qJr[1]  +
                     gf7*(qIqJr[1] + temp1[1]);
    
    float ftm2_2 = gf1*deltaR[2]                 +
                     gf2*atomI.labFrameDipole[2] + gf3*atomJ.labFrameDipole[2]  +
                     gf4*(temp2[2]  - qIdJ[2])   +
                     gf5*qIr[2]    + gf6*qJr[2]  +
                     gf7*(qIqJr[2] + temp1[2]);
Mark Friedrichs's avatar
Mark Friedrichs committed
354
355
356
357
358
359
    

    // get the induced force;

    // intermediate variables for the induced-permanent terms;
    
Peter Eastman's avatar
Peter Eastman committed
360
361
362
363
364
    float gfi1 = rr5*0.5f*((gli1+gli6)*psc0 + (glip1+glip6)*dsc0 + scip2*scaleI0) + rr7*((gli7+gli2)*psc1 + (glip7+glip2)*dsc1 -
                                                       (sci3*scip4+scip3*sci4)*scaleI1)*0.5f + rr9*(gli3*psc2+glip3*dsc2)*0.5f;
    float gfi4 = 2.0f*rr5;
    float gfi5 = rr7* (sci4*psc2 + scip4*dsc2);
    float gfi6 = -rr7*(sci3*psc2 + scip3*dsc2);
Mark Friedrichs's avatar
Mark Friedrichs committed
365
366
367
368
369
370
371
372
373
374
375
376
377
378


    float temp4[3];
    float temp5[3];
    float temp6[3];
    float temp7[3];
    float temp8[3];
    float temp9[3];
    float temp10[3];
    float temp11[3];
    float temp12[3];
    float temp13[3];
    float temp14[3];
    float temp15[3];
379
380
    float qIuJp[3], qJuIp[3];
    float qIuJ[3], qJuI[3];
Mark Friedrichs's avatar
Mark Friedrichs committed
381

382
    amatrixProductVector3(atomJ.labFrameQuadrupole,      atomI.inducedDipoleP,    temp4);
Mark Friedrichs's avatar
Mark Friedrichs committed
383

384
385
386
    amatrixProductVector3(atomI.labFrameQuadrupole,      atomJ.inducedDipoleP,    qIuJp);//MK
    amatrixProductVector3(atomJ.labFrameQuadrupole,      atomI.inducedDipoleP,    qJuIp);//MK
    amatrixProductVector3(atomJ.labFrameQuadrupole,      atomI.inducedDipole ,    qJuI);//MK
Mark Friedrichs's avatar
Mark Friedrichs committed
387

388
389
    amatrixProductVector3(atomJ.labFrameQuadrupole,      atomI.inducedDipole,    temp5);
    amatrixProductVector3(atomI.labFrameQuadrupole,      atomJ.inducedDipole ,     qIuJ);//MK
Mark Friedrichs's avatar
Mark Friedrichs committed
390

391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
/*
               ftm2i(1) = gfi(1)*xr + 0.5d0*
     &           (- rr3*ck*(uind(1,i)*psc3+uinp(1,i)*dsc3)
     &            + rr5*sc(4)*(uind(1,i)*psc5+uinp(1,i)*dsc5)
     &            - rr7*sc(6)*(uind(1,i)*psc7+uinp(1,i)*dsc7))
     &            + (rr3*ci*(uind(1,k)*psc3+uinp(1,k)*dsc3)
     &            + rr5*sc(3)*(uind(1,k)*psc5+uinp(1,k)*dsc5)
     &            + rr7*sc(5)*(uind(1,k)*psc7+uinp(1,k)*dsc7))*0.5d0
     &            + rr5*scale5i*(sci(4)*uinp(1,i)+scip(4)*uind(1,i)
     &            + sci(3)*uinp(1,k)+scip(3)*uind(1,k))*0.5d0
     &            + 0.5d0*(sci(4)*psc5+scip(4)*dsc5)*rr5*di(1)
     &            + 0.5d0*(sci(3)*psc5+scip(3)*dsc5)*rr5*dk(1)
     &            + 0.5d0*gfi(4)*((qkui(1)-qiuk(1))*psc5
     &            + (qkuip(1)-qiukp(1))*dsc5)
     &            + gfi(5)*qir(1) + gfi(6)*qkr(1)
*/

    float ftm2i_0 = gfi1*deltaR[0] +
                    0.5f*(-rr3*atomJ.q*(atomI.inducedDipole[0]*psc0 + atomI.inducedDipoleP[0]*dsc0) +
                    rr5*sc4*(atomI.inducedDipole[0]*psc1 + atomI.inducedDipoleP[0]*dsc1) -
                    rr7*sc6*(atomI.inducedDipole[0]*psc2 + atomI.inducedDipoleP[0]*dsc2)) +
      
                   (rr3*atomI.q*(atomJ.inducedDipole[0]*psc0+atomJ.inducedDipoleP[0]*dsc0) +
                     rr5*sc3*(atomJ.inducedDipole[0]*psc1 +atomJ.inducedDipoleP[0]*dsc1) +
                     rr7*sc5*(atomJ.inducedDipole[0]*psc2 +atomJ.inducedDipoleP[0]*dsc2))*0.5f +
                     rr5*scaleI1*(sci4*atomI.inducedDipoleP[0]+scip4*atomI.inducedDipole[0] +
                     sci3*atomJ.inducedDipoleP[0]+scip3*atomJ.inducedDipole[0])*0.5f +
      
                    0.5f*(sci4*psc1+scip4*dsc1)*rr5*atomI.labFrameDipole[0] +
                    0.5f*(sci3*psc1+scip3*dsc1)*rr5*atomJ.labFrameDipole[0] +
                    0.5f*gfi4*((temp5[0]-qIuJ[0])*psc1 +
                    (temp4[0]-qIuJp[0])*dsc1) + gfi5*qIr[0] + gfi6*qJr[0];
      
    float ftm2i_1  = gfi1*deltaR[1] +
                    0.5f*(-rr3*atomJ.q*(atomI.inducedDipole[1]*psc0 + atomI.inducedDipoleP[1]*dsc0) +
                    rr5*sc4*(atomI.inducedDipole[1]*psc1 + atomI.inducedDipoleP[1]*dsc1) -
                    rr7*sc6*(atomI.inducedDipole[1]*psc2 + atomI.inducedDipoleP[1]*dsc2)) +
      
                    (rr3*atomI.q*(atomJ.inducedDipole[1]*psc0+atomJ.inducedDipoleP[1]*dsc0) +
                     rr5*sc3*(atomJ.inducedDipole[1]*psc1 +atomJ.inducedDipoleP[1]*dsc1) +
                     rr7*sc5*(atomJ.inducedDipole[1]*psc2 +atomJ.inducedDipoleP[1]*dsc2))*0.5f +
                     rr5*scaleI1*(sci4*atomI.inducedDipoleP[1]+scip4*atomI.inducedDipole[1] +
                     sci3*atomJ.inducedDipoleP[1]+scip3*atomJ.inducedDipole[1])*0.5f +
      
                    0.5f*(sci4*psc1+scip4*dsc1)*rr5*atomI.labFrameDipole[1] +
                    0.5f*(sci3*psc1+scip3*dsc1)*rr5*atomJ.labFrameDipole[1] +
                    0.5f*gfi4*((temp5[1]-qIuJ[1])*psc1 +
                    (temp4[1]-qIuJp[1])*dsc1) + gfi5*qIr[1] + gfi6*qJr[1];
      
    float ftm2i_2  = gfi1*deltaR[2] +
                    0.5f*(-rr3*atomJ.q*(atomI.inducedDipole[2]*psc0 + atomI.inducedDipoleP[2]*dsc0) +
                    rr5*sc4*(atomI.inducedDipole[2]*psc1 + atomI.inducedDipoleP[2]*dsc1) -
                    rr7*sc6*(atomI.inducedDipole[2]*psc2 + atomI.inducedDipoleP[2]*dsc2)) +
      
                    (rr3*atomI.q*(atomJ.inducedDipole[2]*psc0+atomJ.inducedDipoleP[2]*dsc0) +
                     rr5*sc3*(atomJ.inducedDipole[2]*psc1 +atomJ.inducedDipoleP[2]*dsc1) +
                     rr7*sc5*(atomJ.inducedDipole[2]*psc2 +atomJ.inducedDipoleP[2]*dsc2))*0.5f +
                     rr5*scaleI1*(sci4*atomI.inducedDipoleP[2]+scip4*atomI.inducedDipole[2] +
                     sci3*atomJ.inducedDipoleP[2]+scip3*atomJ.inducedDipole[2])*0.5f +
      
                    0.5f*(sci4*psc1+scip4*dsc1)*rr5*atomI.labFrameDipole[2] +
                    0.5f*(sci3*psc1+scip3*dsc1)*rr5*atomJ.labFrameDipole[2] +
                    0.5f*gfi4*((temp5[2]-qIuJ[2])*psc1 +
                    (temp4[2]-qIuJp[2])*dsc1) + gfi5*qIr[2] + gfi6*qJr[2];
Mark Friedrichs's avatar
Mark Friedrichs committed
455
456
457
458

    // handle of scaling for partially excluded interactions;
    // correction to convert mutual to direct polarization force;
    
459
460
461
    ftm2i_0 -= (fridmp_0 + findmp_0);
    ftm2i_1 -= (fridmp_1 + findmp_1);
    ftm2i_2 -= (fridmp_2 + findmp_2);
Mark Friedrichs's avatar
Mark Friedrichs committed
462
463
464
465
    
    // now perform the torque calculation;
    // intermediate terms for torque between multipoles i and j;
    
Peter Eastman's avatar
Peter Eastman committed
466
467
468
469
470
    float gti2 = 0.5f*(sci4*psc1+scip4*dsc1)*rr5;
    float gti3 = 0.5f*(sci3*psc1+scip3*dsc1)*rr5;
    float gti4 = gfi4;
    float gti5 = gfi5;
    float gti6 = gfi6;
Mark Friedrichs's avatar
Mark Friedrichs committed
471
472
473

    // get the permanent (ttm2, ttm3) and induced interaction torques (ttm2i, ttm3i)
    
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
    acrossProductVector3(atomI.labFrameDipole,      atomJ.labFrameDipole,      temp1);
    acrossProductVector3(atomI.labFrameDipole,      atomJ.inducedDipole ,      temp2);
    acrossProductVector3(atomI.labFrameDipole,      atomJ.inducedDipoleP,     temp3);
    acrossProductVector3(atomI.labFrameDipole,      deltaR,       temp4);
    acrossProductVector3(deltaR,       qIuJp,   temp5);
    acrossProductVector3(deltaR,       qIr,     temp6);
    acrossProductVector3(deltaR,       qIuJ,    temp7);
    acrossProductVector3(atomJ.inducedDipole ,     qIr,     temp8);
    acrossProductVector3(atomJ.inducedDipoleP,     qIr,     temp9);
    acrossProductVector3(atomI.labFrameDipole,     qJr,     temp10);
    acrossProductVector3(atomJ.labFrameDipole,     qIr,     temp11);
    acrossProductVector3(deltaR,       qIqJr,   temp12);
    acrossProductVector3(deltaR,       qIdJ,    temp13);

    amatrixCrossProductMatrix3(atomI.labFrameQuadrupole,      atomJ.labFrameQuadrupole,      temp14);
    acrossProductVector3(qJr, qIr,     temp15);
Mark Friedrichs's avatar
Mark Friedrichs committed
490

Peter Eastman's avatar
Peter Eastman committed
491
492
493
494
495
496
    float ttm2_0  = -rr3*temp1[0] + gf2*temp4[0]-gf5*temp6[0] + gf4*(temp10[0] + temp11[0] + temp13[0]-2.0f*temp14[0]) - gf7*(temp12[0] + temp15[0]);
    float ttm2i_0 = -rr3*(temp2[0]*psc0+temp3[0]*dsc0)*0.5f + gti2*temp4[0] + gti4*((temp8[0]+ temp7[0])*psc1 + (temp9[0] + temp5[0])*dsc1)*0.5f - gti5*temp6[0];
    float ttm2_1  = -rr3*temp1[1] + gf2*temp4[1]-gf5*temp6[1] + gf4*(temp10[1] + temp11[1] + temp13[1]-2.0f*temp14[1]) - gf7*(temp12[1] + temp15[1]);
    float ttm2i_1 = -rr3*(temp2[1]*psc0+temp3[1]*dsc0)*0.5f + gti2*temp4[1] + gti4*((temp8[1]+ temp7[1])*psc1 + (temp9[1] + temp5[1])*dsc1)*0.5f - gti5*temp6[1];
    float ttm2_2  = -rr3*temp1[2] + gf2*temp4[2]-gf5*temp6[2] + gf4*(temp10[2] + temp11[2] + temp13[2]-2.0f*temp14[2]) - gf7*(temp12[2] + temp15[2]);
    float ttm2i_2 = -rr3*(temp2[2]*psc0+temp3[2]*dsc0)*0.5f + gti2*temp4[2] + gti4*((temp8[2]+ temp7[2])*psc1 + (temp9[2] + temp5[2])*dsc1)*0.5f - gti5*temp6[2];
Mark Friedrichs's avatar
Mark Friedrichs committed
497

498
499
500
501
502
503
504
505
506
507
508
509
510
511
    acrossProductVector3(atomJ.labFrameDipole,      deltaR,       temp2  );
    acrossProductVector3(deltaR,       qJr,     temp3  );
    acrossProductVector3(atomI.labFrameDipole,      qJr,     temp4  );
    acrossProductVector3(atomJ.labFrameDipole,      qIr,     temp5  );
    acrossProductVector3(deltaR,       qJdI,    temp6  );
    acrossProductVector3(deltaR,       qJqIr,   temp7  );
    acrossProductVector3(qJr,     qIr,     temp8  ); // _qJrxqIr
    acrossProductVector3(atomJ.labFrameDipole,      atomI.inducedDipole ,      temp9  ); // _dJxuI
    acrossProductVector3(atomJ.labFrameDipole,      atomI.inducedDipoleP,     temp10 ); // _dJxuIp

    acrossProductVector3(atomI.inducedDipoleP,     qJr,     temp11 ); // _uIxqJrp
    acrossProductVector3(atomI.inducedDipole ,     qJr,     temp12 ); // _uIxqJr
    acrossProductVector3(deltaR,       qJuIp,   temp13 ); // _rxqJuIp
    acrossProductVector3(deltaR,       qJuI,    temp15 ); // _rxqJuI
Mark Friedrichs's avatar
Mark Friedrichs committed
512

Peter Eastman's avatar
Peter Eastman committed
513
514
515
516
517
518
    float ttm3_0 = rr3*temp1[0] + gf3*temp2[0] - gf6*temp3[0] - gf4*(temp4[0] + temp5[0] + temp6[0] - 2.0f*temp14[0]) - gf7*(temp7[0] - temp8[0]);
    float ttm3i_0 = -rr3*(temp9[0]*psc0+ temp10[0]*dsc0)*0.5f + gti3*temp2[0] - gti4*((temp12[0] + temp15[0])*psc1 + (temp11[0] + temp13[0])*dsc1)*0.5f - gti6*temp3[0];
    float ttm3_1 = rr3*temp1[1] + gf3*temp2[1] - gf6*temp3[1] - gf4*(temp4[1] + temp5[1] + temp6[1] - 2.0f*temp14[1]) - gf7*(temp7[1] - temp8[1]);
    float ttm3i_1 = -rr3*(temp9[1]*psc0+ temp10[1]*dsc0)*0.5f + gti3*temp2[1] - gti4*((temp12[1] + temp15[1])*psc1 + (temp11[1] + temp13[1])*dsc1)*0.5f - gti6*temp3[1];
    float ttm3_2 = rr3*temp1[2] + gf3*temp2[2] - gf6*temp3[2] - gf4*(temp4[2] + temp5[2] + temp6[2] - 2.0f*temp14[2]) - gf7*(temp7[2] - temp8[2]);
    float ttm3i_2 = -rr3*(temp9[2]*psc0+ temp10[2]*dsc0)*0.5f + gti3*temp2[2] - gti4*((temp12[2] + temp15[2])*psc1 + (temp11[2] + temp13[2])*dsc1)*0.5f - gti6*temp3[2];
Mark Friedrichs's avatar
Mark Friedrichs committed
519
520
521

    if( scalingFactors[MScaleIndex] < 1.0f ){
    
522
523
524
        ftm2_0 *= scalingFactors[MScaleIndex];
        ftm2_1 *= scalingFactors[MScaleIndex];
        ftm2_2 *= scalingFactors[MScaleIndex];
Mark Friedrichs's avatar
Mark Friedrichs committed
525
        
Peter Eastman's avatar
Peter Eastman committed
526
527
528
        ttm2_0 *= scalingFactors[MScaleIndex];
        ttm2_1 *= scalingFactors[MScaleIndex];
        ttm2_2 *= scalingFactors[MScaleIndex];
Mark Friedrichs's avatar
Mark Friedrichs committed
529
        
Peter Eastman's avatar
Peter Eastman committed
530
531
532
        ttm3_0 *= scalingFactors[MScaleIndex];
        ttm3_1 *= scalingFactors[MScaleIndex];
        ttm3_2 *= scalingFactors[MScaleIndex];
Mark Friedrichs's avatar
Mark Friedrichs committed
533
534
535
536
537
538
539
    
    }


#ifdef AMOEBA_DEBUG
if( 0 ){
int debugIndex               = 0;
540
541
542
543
    debugArray[debugIndex].x = scalingFactors[DScaleIndex];
    debugArray[debugIndex].y = scalingFactors[PScaleIndex];
    debugArray[debugIndex].z = scalingFactors[MScaleIndex];
    debugArray[debugIndex].w = scalingFactors[UScaleIndex];
Mark Friedrichs's avatar
Mark Friedrichs committed
544
545

    debugIndex++;
546
547
548
549
    debugArray[debugIndex].x = ftm2i_0 + (fridmp_0 + findmp_0);
    debugArray[debugIndex].y = ftm2i_1 + (fridmp_1 + findmp_1);
    debugArray[debugIndex].z = ftm2i_2 + (fridmp_2 + findmp_2);
    debugArray[debugIndex].w = 1.5;
Mark Friedrichs's avatar
Mark Friedrichs committed
550

551
/*
Mark Friedrichs's avatar
Mark Friedrichs committed
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
    debugIndex++;
    debugArray[debugIndex].x = temp2[0];
    debugArray[debugIndex].y = temp2[1];
    debugArray[debugIndex].z = temp2[2];
    debugArray[debugIndex].w = 2.0f;

    debugIndex++;
    debugArray[debugIndex].x = temp3[0];
    debugArray[debugIndex].y = temp3[1];
    debugArray[debugIndex].z = temp3[2];
    debugArray[debugIndex].w = 3.0f;

    debugIndex++;
    debugArray[debugIndex].x = temp4[0];
    debugArray[debugIndex].y = temp4[1];
    debugArray[debugIndex].z = temp4[2];
    debugArray[debugIndex].w = 4.0f;

    debugIndex++;
    debugArray[debugIndex].x = temp5[0];
    debugArray[debugIndex].y = temp5[1];
    debugArray[debugIndex].z = temp5[2];
    debugArray[debugIndex].w = 5.0f;

    debugIndex++;
    debugArray[debugIndex].x = temp6[0];
    debugArray[debugIndex].y = temp6[1];
    debugArray[debugIndex].z = temp6[2];
    debugArray[debugIndex].w = 6.0f;

    debugIndex++;
    debugArray[debugIndex].x = temp14[0];
    debugArray[debugIndex].y = temp14[1];
    debugArray[debugIndex].z = temp14[2];
    debugArray[debugIndex].w = 14.0f;

    debugIndex++;
    debugArray[debugIndex].x = temp7[0];
    debugArray[debugIndex].y = temp7[1];
    debugArray[debugIndex].z = temp7[2];
    debugArray[debugIndex].w = 7.0f;


    debugIndex++;
    debugArray[debugIndex].x = temp8[0];
    debugArray[debugIndex].y = temp8[1];
    debugArray[debugIndex].z = temp8[2];
    debugArray[debugIndex].w = 8.0f;

    debugIndex++;
    debugArray[debugIndex].x = rr3;
Peter Eastman's avatar
Peter Eastman committed
603
604
    debugArray[debugIndex].y = gf3;
    debugArray[debugIndex].z = gf6;
Mark Friedrichs's avatar
Mark Friedrichs committed
605
606
607
    debugArray[debugIndex].w = 20.0f;

    debugIndex++;
Peter Eastman's avatar
Peter Eastman committed
608
609
    debugArray[debugIndex].x = gf4;
    debugArray[debugIndex].y = gf7;
Mark Friedrichs's avatar
Mark Friedrichs committed
610
611
612
613
    debugArray[debugIndex].z = 0.0f;
    debugArray[debugIndex].w = 21.0f;

    debugIndex++;
614
615
616
    debugArray[debugIndex].x = atomJ.labFrameDipole[0];
    debugArray[debugIndex].y = atomJ.labFrameDipole[1];
    debugArray[debugIndex].z = atomJ.labFrameDipole[2];
Mark Friedrichs's avatar
Mark Friedrichs committed
617
618
619
    debugArray[debugIndex].w = 22.0f;

    debugIndex++;
620
621
622
    debugArray[debugIndex].x = deltaR[0];
    debugArray[debugIndex].y = deltaR[1];
    debugArray[debugIndex].z = deltaR[2];
Mark Friedrichs's avatar
Mark Friedrichs committed
623
    debugArray[debugIndex].w = 23.0f;
624
*/
Mark Friedrichs's avatar
Mark Friedrichs committed
625
626
627
628

}
#endif

629
630
631
    outputForce->x       = -(ftm2_0 + ftm2i_0);
    outputForce->y       = -(ftm2_1 + ftm2i_1);
    outputForce->z       = -(ftm2_2 + ftm2i_2);
Mark Friedrichs's avatar
Mark Friedrichs committed
632
    
633
634
635
    outputTorque[0].x    =  (ttm2_0 + ttm2i_0);
    outputTorque[0].y    =  (ttm2_1 + ttm2i_1);
    outputTorque[0].z    =  (ttm2_2 + ttm2i_2);
Mark Friedrichs's avatar
Mark Friedrichs committed
636

637
638
639
    outputTorque[1].x    =  (ttm3_0 + ttm3i_0);
    outputTorque[1].y    =  (ttm3_1 + ttm3i_1);
    outputTorque[1].z    =  (ttm3_2 + ttm3i_2);
Mark Friedrichs's avatar
Mark Friedrichs committed
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657

    return;

}

/**---------------------------------------------------------------------------------------

   Copy the electrostatic data of a single atom from the flat per-atom input
   arrays into an ElectrostaticParticle record.

   @param sA                     destination particle record
   @param atomI                  index of the atom to load
   @param atomCoord              per-atom float4: x, y, z = coordinates, w = charge
   @param labFrameDipoleJ        lab-frame dipoles, 3 floats per atom
   @param labQuadrupole          lab-frame quadrupoles, 9 floats per atom
   @param inducedDipole          induced dipoles, 3 floats per atom
   @param inducedDipolePolar     induced polar dipoles, 3 floats per atom
   @param dampingFactorAndThole  per-atom float2: x = damping factor, y = Thole value

   --------------------------------------------------------------------------------------- */

__device__ void loadElectrostaticShared( struct ElectrostaticParticle* sA, unsigned int atomI,
                                         float4* atomCoord, float* labFrameDipoleJ, float* labQuadrupole,
                                         float* inducedDipole, float* inducedDipolePolar, float2* dampingFactorAndThole )
{
    // coordinates & charge: fetch the float4 once rather than once per component

    const float4 positionAndCharge = atomCoord[atomI];
    sA->x                          = positionAndCharge.x;
    sA->y                          = positionAndCharge.y;
    sA->z                          = positionAndCharge.z;
    sA->q                          = positionAndCharge.w;

    // offsets of this atom in the 3-float and 9-float per-atom arrays

    const unsigned int offset3     = atomI*3;
    const unsigned int offset9     = atomI*9;

    // lab dipole, induced dipole and induced polar dipole (3 components each)

#pragma unroll
    for( int ii = 0; ii < 3; ii++ ){
        sA->labFrameDipole[ii]     = labFrameDipoleJ[offset3+ii];
        sA->inducedDipole[ii]      = inducedDipole[offset3+ii];
        sA->inducedDipoleP[ii]     = inducedDipolePolar[offset3+ii];
    }

    // lab quadrupole (9 components)

#pragma unroll
    for( int ii = 0; ii < 9; ii++ ){
        sA->labFrameQuadrupole[ii] = labQuadrupole[offset9+ii];
    }

    // damping factor & Thole value

    const float2 dampAndThole      = dampingFactorAndThole[atomI];
    sA->damp                       = dampAndThole.x;
    sA->thole                      = dampAndThole.y;

}

// Include versions of the kernels for N^2 calculations.
//
// The same header is included twice. Each inclusion sees a different
// METHOD_NAME(a, b) token-pasting macro and a different state of
// USE_OUTPUT_BUFFER_PER_WARP, so the two inclusions produce distinctly named
// definitions. The kernels launched later in this file are
// kCalculateAmoebaCudaElectrostaticN2Forces_kernel and
// kCalculateAmoebaCudaElectrostaticN2ByWarpForces_kernel; presumably the header
// builds those names through METHOD_NAME -- confirm against the header.
// The order of the directives below matters: each #undef/#define must precede
// the #include it is meant to configure.

// first inclusion: USE_OUTPUT_BUFFER_PER_WARP undefined, "N2" pasted into names
#undef USE_OUTPUT_BUFFER_PER_WARP
#define METHOD_NAME(a, b) a##N2##b
#include "kCalculateAmoebaCudaElectrostatic.h"
// second inclusion: USE_OUTPUT_BUFFER_PER_WARP defined, "N2ByWarp" pasted into names
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##N2ByWarp##b
#include "kCalculateAmoebaCudaElectrostatic.h"

/**---------------------------------------------------------------------------------------

   Reduce the electrostatic work arrays into the force and torque arrays:

       psWorkArray_3_1 -> psForce
       psWorkArray_3_2 -> psTorque

   Each reduction is one launch of kReduceFields_kernel on
   nonbondBlocks x fieldReduceThreadsPerBlock threads, given
   paddedNumberOfAtoms*3 and outputBuffers as its leading arguments.

   @param amoebaGpu        amoebaGpu context

   --------------------------------------------------------------------------------------- */

static void kReduceForceTorque(amoebaGpuContext amoebaGpu )
{
    // psWorkArray_3_1 -> force

    kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
                               amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
                               amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psForce->_pDevData );
    LAUNCHERROR("kReduceElectrostaticForce");

    // psWorkArray_3_2 -> torque

    kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
                               amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
                               amoebaGpu->psWorkArray_3_2->_pDevData, amoebaGpu->psTorque->_pDevData );
    LAUNCHERROR("kReduceElectrostaticTorque");
}


/**---------------------------------------------------------------------------------------

   Compute Amoeba electrostatic force & torque

   Steps performed:
     1. on the first call, choose and cache the threads/block used for the N^2 kernel
     2. clear the work arrays via kClearFields_3
     3. launch the N2ByWarp or N2 kernel, selected by gpu->bOutputBufferPerWarp;
        the kernel writes to psWorkArray_3_1 and psWorkArray_3_2
     4. reduce the work arrays into psForce and psTorque via kReduceForceTorque

   When AMOEBA_DEBUG is defined, a debug array is additionally passed to the kernel and
   forces, torques and the debug array are written to amoebaGpu->log (if set) and to file.

   @param amoebaGpu        amoebaGpu context

   --------------------------------------------------------------------------------------- */

void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu )
{
  
   // ---------------------------------------------------------------------------------------

    // cached across calls; 0 means "not yet chosen"

    static unsigned int threadsPerBlock = 0;

#ifdef AMOEBA_DEBUG
    static const char* methodName = "cudaComputeAmoebaElectrostatic";
    static int timestep = 0;
    // NOTE(review): this fileId is shadowed by a local one where the output file is written
    // below, so the timestep stored here is not used for the file name -- confirm intent
    std::vector<int> fileId;
    timestep++;
    fileId.resize( 2 );
    fileId[0] = timestep;
    fileId[1] = 1;
#endif

    // ---------------------------------------------------------------------------------------

    gpuContext gpu = amoebaGpu->gpuContext;

    // apparently debug array can take up nontrivial no. registers

#ifdef AMOEBA_DEBUG
    if( amoebaGpu->log ){
        (void) fprintf( amoebaGpu->log, "%s %d maxCovalentDegreeSz=%d ZZZ\n",
                        methodName, gpu->natoms, amoebaGpu->maxCovalentDegreeSz );
    }   
    static const int maxSlots                 = 20;
    int paddedNumberOfAtoms                   = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
    CUDAStream<float4>* debugArray            = new CUDAStream<float4>(maxSlots*paddedNumberOfAtoms, 1, "DebugArray");
    memset( debugArray->_pSysData,      0, sizeof( float )*4*maxSlots*paddedNumberOfAtoms);
    debugArray->Upload();
    unsigned int targetAtom                   = 237;
#endif

    // on first pass, set threads/block

    if( threadsPerBlock == 0 ){
        unsigned int maxThreads;
        if (gpu->sm_version >= SM_20)
            maxThreads = 384;
        else if (gpu->sm_version >= SM_12)
            maxThreads = 128;
        else
            maxThreads = 64;
        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(ElectrostaticParticle)), maxThreads);
    }

    // dynamic shared memory requested per block: one ElectrostaticParticle per thread

    const size_t sharedMemoryPerBlock = sizeof(ElectrostaticParticle)*threadsPerBlock;

    kClearFields_3( amoebaGpu, 2 );
    LAUNCHERROR("kClearFields_3 kCalculateAmoebaCudaElectrostatic");

    if (gpu->bOutputBufferPerWarp){

#ifdef AMOEBA_DEBUG
        if( amoebaGpu->log ){
            // size-typed values are cast explicitly so they match the %lu conversions
            (void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaElectrostaticN2Forces warp:  numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u\n",
                            amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
                            (unsigned long) sizeof(ElectrostaticParticle), (unsigned long) sharedMemoryPerBlock,
                            (unsigned long) (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
            (void) fflush( amoebaGpu->log );
        }
#endif

        kCalculateAmoebaCudaElectrostaticN2ByWarpForces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sharedMemoryPerBlock>>>(
                                                                           amoebaGpu->psWorkUnit->_pDevData,
                                                                           gpu->psPosq4->_pDevData,
                                                                           amoebaGpu->psLabFrameDipole->_pDevData,
                                                                           amoebaGpu->psLabFrameQuadrupole->_pDevData,
                                                                           amoebaGpu->psInducedDipole->_pDevData,
                                                                           amoebaGpu->psInducedDipolePolar->_pDevData,
                                                                           amoebaGpu->psWorkArray_3_1->_pDevData,
#ifdef AMOEBA_DEBUG
                                                                           amoebaGpu->psWorkArray_3_2->_pDevData,
                                                                           debugArray->_pDevData, targetAtom );
#else
                                                                           amoebaGpu->psWorkArray_3_2->_pDevData );
#endif

    } else {

#ifdef AMOEBA_DEBUG
        if( amoebaGpu->log ){
            // size-typed values are cast explicitly so they match the %lu conversions
            (void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaElectrostaticN2Forces no warp:  numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%lu shrd=%lu xnCt=%lu workUnits=%u\n",
                            amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
                            (unsigned long) sizeof(ElectrostaticParticle), (unsigned long) sharedMemoryPerBlock,
                            (unsigned long) (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
            (void) fflush( amoebaGpu->log );
        }
#endif

        kCalculateAmoebaCudaElectrostaticN2Forces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sharedMemoryPerBlock>>>(
                                                                           amoebaGpu->psWorkUnit->_pDevData,
                                                                           gpu->psPosq4->_pDevData,
                                                                           amoebaGpu->psLabFrameDipole->_pDevData,
                                                                           amoebaGpu->psLabFrameQuadrupole->_pDevData,
                                                                           amoebaGpu->psInducedDipole->_pDevData,
                                                                           amoebaGpu->psInducedDipolePolar->_pDevData,
                                                                           amoebaGpu->psWorkArray_3_1->_pDevData,
#ifdef AMOEBA_DEBUG
                                                                           amoebaGpu->psWorkArray_3_2->_pDevData,
                                                                           debugArray->_pDevData, targetAtom );
#else
                                                                           amoebaGpu->psWorkArray_3_2->_pDevData );
#endif
    }
    LAUNCHERROR("kCalculateAmoebaCudaElectrostaticN2Forces");

    kReduceForceTorque( amoebaGpu );
    LAUNCHERROR("kReduceForceTorque");

#ifdef AMOEBA_DEBUG
    if( amoebaGpu->log ){

        amoebaGpu->psForce->Download();
        amoebaGpu->psTorque->Download();
        debugArray->Download();

        (void) fprintf( amoebaGpu->log, "Finished Electrostatic kernel execution\n" ); (void) fflush( amoebaGpu->log );

        // print force & torque for the first and last maxPrint atoms

        int maxPrint        = 1400;
        for( int ii = 0; ii < gpu->natoms; ii++ ){
           (void) fprintf( amoebaGpu->log, "%5d ", ii); 

            int indexOffset     = ii*3;
    
           // force

           (void) fprintf( amoebaGpu->log,"ElectrostaticF [%16.9e %16.9e %16.9e] ",
                           amoebaGpu->psForce->_pSysData[indexOffset],
                           amoebaGpu->psForce->_pSysData[indexOffset+1],
                           amoebaGpu->psForce->_pSysData[indexOffset+2] );
    
           // torque

           (void) fprintf( amoebaGpu->log,"ElectrostaticT [%16.9e %16.9e %16.9e] ",
                           amoebaGpu->psTorque->_pSysData[indexOffset],
                           amoebaGpu->psTorque->_pSysData[indexOffset+1],
                           amoebaGpu->psTorque->_pSysData[indexOffset+2] );

           (void) fprintf( amoebaGpu->log,"\n" );
           if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
                ii = gpu->natoms - maxPrint;
           }
        }

        // first 8 debug slots for each atom; slot kk of atom jj is at jj + kk*paddedNumberOfAtoms

        if( 1 ){
            (void) fprintf( amoebaGpu->log,"DebugElec\n" );
            for( int jj = 0; jj < gpu->natoms; jj++ ){
                int debugIndex = jj;
                for( int kk = 0; kk < 8; kk++ ){
                    // slot 0 is printed unscaled; slots 1-7 are scaled by 1/4.184
                    float conversion = kk == 0 ? 1.0f : 1.0f/4.184f;
                    (void) fprintf( amoebaGpu->log,"%5d %5d [%16.9e %16.9e %16.9e %16.9e] E11\n", targetAtom, jj,
                                    conversion*debugArray->_pSysData[debugIndex].x, conversion*debugArray->_pSysData[debugIndex].y,
                                    conversion*debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w );
                    debugIndex += paddedNumberOfAtoms;
                }
                (void) fprintf( amoebaGpu->log,"\n" );
            }
        }

        // debug slots 1, 5, 6 and 7 for each atom, labelled with the ordered (atom, targetAtom) pair

        if( 1 ){
            (void) fprintf( amoebaGpu->log,"DebugElec\n" );
            for( int jj = 0; jj < gpu->natoms; jj++ ){
                int debugIndex1 = jj + paddedNumberOfAtoms;
                int debugIndex2 = jj + 5*paddedNumberOfAtoms;
                int debugIndex3 = jj + 6*paddedNumberOfAtoms;
                int debugIndex5 = jj + 7*paddedNumberOfAtoms;
                float conversion = 1.0f/4.184f;
                int i1,i2;
                if( jj < targetAtom ){
                    i1 = jj;
                    i2 = targetAtom;
                } else {
                    i1 = targetAtom;
                    i2 = jj;
                }
                (void) fprintf( amoebaGpu->log,"%5d %5d %16.9e %16.9e %16.9e    %16.9e %16.9e %16.9e   %16.9e %16.9e %16.9e %16.9e %16.9e %16.9e F11\n", i1,i2,
                                conversion*debugArray->_pSysData[debugIndex1].x,
                                conversion*debugArray->_pSysData[debugIndex1].y,
                                conversion*debugArray->_pSysData[debugIndex1].z, 
                                conversion*debugArray->_pSysData[debugIndex2].x,
                                conversion*debugArray->_pSysData[debugIndex2].y,
                                conversion*debugArray->_pSysData[debugIndex2].z, 
                                conversion*debugArray->_pSysData[debugIndex3].x,
                                conversion*debugArray->_pSysData[debugIndex3].y,
                                conversion*debugArray->_pSysData[debugIndex3].z,
                                conversion*debugArray->_pSysData[debugIndex5].x,
                                conversion*debugArray->_pSysData[debugIndex5].y,
                                conversion*debugArray->_pSysData[debugIndex5].z );
            }
        }
        (void) fflush( amoebaGpu->log );

        if( 0 ){
            (void) fprintf( amoebaGpu->log, "%s Tiled F & T\n", methodName ); fflush( amoebaGpu->log );
            int maxPrint = 12;
            for( int ii = 0; ii < gpu->natoms; ii++ ){
    
                // print cpu & gpu reductions
    
                int offset  = 3*ii;
    
                (void) fprintf( amoebaGpu->log,"%6d F[%16.7e %16.7e %16.7e] T[%16.7e %16.7e %16.7e]\n", ii,
                                amoebaGpu->psForce->_pSysData[offset],
                                amoebaGpu->psForce->_pSysData[offset+1],
                                amoebaGpu->psForce->_pSysData[offset+2],
                                amoebaGpu->psTorque->_pSysData[offset],
                                amoebaGpu->psTorque->_pSysData[offset+1],
                                amoebaGpu->psTorque->_pSysData[offset+2] );
                if( (ii == maxPrint) && (ii < (gpu->natoms - maxPrint)) )ii = gpu->natoms - maxPrint; 
            }   
        }   

        // write coordinates, forces and torques to file

        if( 1 ){
            std::vector<int> fileId;
            //fileId.push_back( 0 );
            VectorOfDoubleVectors outputVector;
            cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,            outputVector, NULL, 1.0f );
            cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psForce,      outputVector, NULL, 1.0f );
            cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psTorque,     outputVector, NULL, 1.0f );
            cudaWriteVectorOfDoubleVectorsToFile( "CudaForceTorque", fileId, outputVector );
         }

    }   
    delete debugArray;

#endif

    // disabled: dump forces & torques scaled by 1/4.184 to file

    if( 0 ){
        std::vector<int> fileId;
        //fileId.push_back( 0 );
        VectorOfDoubleVectors outputVector;
        //cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,            outputVector, NULL, 1.0f );
        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psForce,      outputVector, NULL, 1.0f/4.184 );
        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psTorque,     outputVector, NULL, 1.0f/4.184 );
        cudaWriteVectorOfDoubleVectorsToFile( "CudaForceTorque", fileId, outputVector );
     }

   // ---------------------------------------------------------------------------------------
}