kCalculateAmoebaCudaElectrostatic.cu 36 KB
Newer Older
Mark Friedrichs's avatar
Mark Friedrichs committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
//-----------------------------------------------------------------------------------------

//-----------------------------------------------------------------------------------------

#include "amoebaGpuTypes.h"
#include "amoebaCudaKernels.h"
#include "kCalculateAmoebaCudaUtilities.h"

//#define AMOEBA_DEBUG

static __constant__ cudaGmxSimulation cSim;
static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;

// Host -> device: copy the two simulation parameter blocks held by the context
// (gpu->sim and amoebaGpu->amoebaSim) into this file's __constant__ symbols
// cSim and cAmoebaSim. Each copy's status is handed to RTERROR.
void SetCalculateAmoebaElectrostaticSim(amoebaGpuContext amoebaGpu)
{
    gpuContext gpu = amoebaGpu->gpuContext;

    // base simulation block
    cudaError_t status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
    RTERROR(status, "SetCalculateAmoebaElectrostaticSim: cudaMemcpyToSymbol: SetSim copy to cSim failed");

    // AMOEBA-specific simulation block
    status = cudaMemcpyToSymbol(cAmoebaSim, &amoebaGpu->amoebaSim, sizeof(cudaAmoebaGmxSimulation));
    RTERROR(status, "SetCalculateAmoebaElectrostaticSim: cudaMemcpyToSymbol: SetSim copy to cAmoebaSim failed");
}

// Device -> host: copy this file's __constant__ symbols cSim and cAmoebaSim back
// into the context (gpu->sim and amoebaGpu->amoebaSim). Each copy's status is
// handed to RTERROR.
//
// The diagnostic text names this routine's direction ("GetSim"); it previously
// said "SetSim", a leftover from the setter, which made a failed read-back look
// like a failed upload in the logs.
void GetCalculateAmoebaElectrostaticSim(amoebaGpuContext amoebaGpu)
{
    cudaError_t status;
    gpuContext gpu = amoebaGpu->gpuContext;

    // base simulation block
    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
    RTERROR(status, "GetCalculateAmoebaElectrostaticSim: cudaMemcpyFromSymbol: GetSim copy from cSim failed");

    // AMOEBA-specific simulation block
    status = cudaMemcpyFromSymbol(&amoebaGpu->amoebaSim, cAmoebaSim, sizeof(cudaAmoebaGmxSimulation));
    RTERROR(status, "GetCalculateAmoebaElectrostaticSim: cudaMemcpyFromSymbol: GetSim copy from cAmoebaSim failed");
}

// Slot numbers within the float scalingFactors array that is passed to
// calculateElectrostaticPairIxn_kernel.

// Slots 0-3: per-pair multipliers that the pair kernel reads but does not write
// in the code visible in this file.
static int const PScaleIndex            =  0; 
static int const DScaleIndex            =  1; 
static int const UScaleIndex            =  2; 
static int const MScaleIndex            =  3;
// Slots 4-7: damping factors written by the pair kernel when its damping branch
// is taken. Scale3/5/7 must stay consecutive: the kernel addresses them as
// Scale3Index, Scale3Index+1 and Scale3Index+2.
static int const Scale3Index            =  4;
static int const Scale5Index            =  5;
static int const Scale7Index            =  6;
static int const Scale9Index            =  7;
// Slots 8-16: three groups of three floats each. Only the first slot of each
// group is named; the kernel reaches the others as DdscN0Index + ii (ii = 0..2)
// or through a pointer to the first slot. The commented-out names are the
// second and third slots of each group.
static int const Ddsc30Index            =  8;
//static int const Ddsc31Index            =  9;
//static int const Ddsc32Index            = 10; 
static int const Ddsc50Index            = 11;
//static int const Ddsc51Index            = 12;
//static int const Ddsc52Index            = 13; 
static int const Ddsc70Index            = 14;
//static int const Ddsc71Index            = 15;
//static int const Ddsc72Index            = 16;
51
52
53
54
55
56
57
// One greater than the highest scaling slot in use (Ddsc70Index + 2 == 16).
// NOTE(review): presumably the required length of a scalingFactors array --
// confirm against the code that allocates it.
static int const LastScalingIndex       = 17;

// Dot product over elements 0..2 of u and v.
// Arguments are parenthesized so that pointer expressions such as `p + 1`
// index correctly.
#define DOT3_4(u,v) (((u)[0])*((v)[0]) + ((u)[1])*((v)[1]) + ((u)[2])*((v)[2]))

// Sum of element-wise products over elements 0..8 of u and v (two 9-float
// arrays). The whole expansion is wrapped in parentheses: without them
// `k*MATRIXDOT31(u,v)` would scale only the first product. A plain
// `x = MATRIXDOT31(u,v)` evaluates exactly as before.
#define MATRIXDOT31(u,v) ((u)[0]*(v)[0] + (u)[1]*(v)[1] + (u)[2]*(v)[2] + \
  (u)[3]*(v)[3] + (u)[4]*(v)[4] + (u)[5]*(v)[5] + \
  (u)[6]*(v)[6] + (u)[7]*(v)[7] + (u)[8]*(v)[8])
Mark Friedrichs's avatar
Mark Friedrichs committed
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94

// Dot product over elements 0..2 of u and v. Arguments are parenthesized so
// that an expression argument (e.g. `matrix + 3`) is indexed as a whole.
#define DOT31(u,v) (((u)[0])*((v)[0]) + ((u)[1])*((v)[1]) + ((u)[2])*((v)[2]))

// 0.257142857 is 9/35 to nine decimals. The pair kernel multiplies it into the
// (2*damp^2 + damp^3) term when it computes the Scale9 damping factor.
#define i35 0.257142857f
// Single-precision 1.0, used in the damping-factor expressions.
// NOTE(review): as a macro this replaces every later identifier spelled `one`.
#define one 1.0f

// Cross product of two 3-vectors: vectorZ = vectorX x vectorY.
//
//   vectorX, vectorY : 3-float inputs (only read)
//   vectorZ          : 3-float output
//
// All three components are computed before anything is stored, so vectorZ may
// be the same array as vectorX or vectorY. Storing component 0 first would
// otherwise corrupt the inputs still needed for components 1 and 2.
__device__ void acrossProductVector3(   float* vectorX, float* vectorY, float* vectorZ ){
    const float z0 = vectorX[1]*vectorY[2] - vectorX[2]*vectorY[1];
    const float z1 = vectorX[2]*vectorY[0] - vectorX[0]*vectorY[2];
    const float z2 = vectorX[0]*vectorY[1] - vectorX[1]*vectorY[0];

    vectorZ[0]  = z0;
    vectorZ[1]  = z1;
    vectorZ[2]  = z2;
}

// Product of a 9-float array with a 3-vector:
//
//   vectorZ[i] = matrixX[i]*vectorY[0] + matrixX[i+3]*vectorY[1] + matrixX[i+6]*vectorY[2]
//
// i.e. elements i, i+3, i+6 of matrixX form the i-th row of the operator
// applied to vectorY.
//
//   matrixX : 9-float input (only read)
//   vectorY : 3-float input (only read)
//   vectorZ : 3-float output
//
// All three components are computed before anything is stored, so vectorZ may
// alias vectorY (in-place transform). Storing component 0 first would otherwise
// feed an already-overwritten vectorY[0] into components 1 and 2.
__device__ void amatrixProductVector3(   float* matrixX, float* vectorY, float* vectorZ ){
    const float z0 = matrixX[0]*vectorY[0] + matrixX[3]*vectorY[1] + matrixX[6]*vectorY[2];
    const float z1 = matrixX[1]*vectorY[0] + matrixX[4]*vectorY[1] + matrixX[7]*vectorY[2];
    const float z2 = matrixX[2]*vectorY[0] + matrixX[5]*vectorY[1] + matrixX[8]*vectorY[2];

    vectorZ[0]  = z0;
    vectorZ[1]  = z1;
    vectorZ[2]  = z2;
}

// Builds a 3-vector from two 9-float arrays. Each array is viewed as three
// consecutive 3-float groups g0, g1, g2; the output follows the cross-product
// index pattern with group dot products in place of scalar products:
//
//   vectorZ[0] = X.g1 . Y.g2  -  X.g2 . Y.g1
//   vectorZ[1] = X.g2 . Y.g0  -  X.g0 . Y.g2
//   vectorZ[2] = X.g0 . Y.g1  -  X.g1 . Y.g0
//
//   matrixX, matrixY : 9-float inputs (only read)
//   vectorZ          : 3-float output
__device__ void amatrixCrossProductMatrix3( float* matrixX, float* matrixY, float* vectorZ ){

    // the three 3-float groups of the first array
    float* xGroup0 = matrixX;
    float* xGroup1 = matrixX + 3;
    float* xGroup2 = matrixX + 6;

    // the three 3-float groups of the second array
    float* yGroup0 = matrixY;
    float* yGroup1 = matrixY + 3;
    float* yGroup2 = matrixY + 6;

    vectorZ[0] = DOT31( xGroup1, yGroup2 ) - DOT31( xGroup2, yGroup1 );
    vectorZ[1] = DOT31( xGroup2, yGroup0 ) - DOT31( xGroup0, yGroup2 );
    vectorZ[2] = DOT31( xGroup0, yGroup1 ) - DOT31( xGroup1, yGroup0 );
}

95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
// Per-atom record consumed by calculateElectrostaticPairIxn_kernel.
// The struct is 31 consecutive floats; field order and sizes fix the memory
// layout and must not be rearranged.
struct ElectrostaticParticle {

    float x, y, z;                  // coordinates
    float q;                        // charge

    float labFrameDipole[3];        // lab frame dipole
    float labFrameQuadrupole[9];    // lab frame quadrupole (9 floats)

    float inducedDipole[3];         // induced dipole
    float inducedDipoleP[3];        // polar induced dipole

    float thole, damp;              // scaling (damping) parameters

    float force[3];                 // force slot
    float torque[3];                // torque slot

    float padding;                  // filler float; brings the total to 31

};

__device__ void calculateElectrostaticPairIxn_kernel( ElectrostaticParticle& atomI,   ElectrostaticParticle& atomJ,
133
                                                      float* scalingFactors, float4*  outputForce, float4  outputTorque[2]
Mark Friedrichs's avatar
Mark Friedrichs committed
134
135
136
137
138
#ifdef AMOEBA_DEBUG
                                                      ,float4* debugArray 
#endif
 ){
  
139
    float deltaR[3];
Mark Friedrichs's avatar
Mark Friedrichs committed
140
141
142
143
144
145
146
147
148
    
    // ---------------------------------------------------------------------------------------
    
    // ---------------------------------------------------------------------------------------

    float* ddsc3                    =  scalingFactors + Ddsc30Index;
    float* ddsc5                    =  scalingFactors + Ddsc50Index;
    float* ddsc7                    =  scalingFactors + Ddsc70Index;

149
150
151
    deltaR[0]                       = atomJ.x - atomI.x;
    deltaR[1]                       = atomJ.y - atomI.y;
    deltaR[2]                       = atomJ.z - atomI.z;
Mark Friedrichs's avatar
Mark Friedrichs committed
152
153
154
155
156
157
158
159
160
161
162
163
164

    float r2                        = DOT31( deltaR, deltaR );
    float r                         = sqrtf( r2 );
    float rr1                       = 1.0f/r;
    float rr2                       = rr1*rr1;
    float rr3                       = rr1*rr2;
    float rr5                       = 3.0f*rr3*rr2;
    float rr7                       = 5.0f*rr5*rr2;
    float rr9                       = 7.0f*rr7*rr2;
    float rr11                      = 9.0f*rr9*rr2;

    //-------------------------------------------

165
    if( atomI.damp != 0.0f && atomJ.damp != 0.0 && r < cAmoebaSim.scalingDistanceCutoff ){
Mark Friedrichs's avatar
Mark Friedrichs committed
166
167
168
169
170
   
        float distanceIJ, r2I;
        distanceIJ                    = r;
        r2I                           = rr2;
        
171
172
        float ratio                   = distanceIJ/(atomI.damp*atomJ.damp);
        float pGamma                  = atomJ.thole > atomI.thole ? atomI.thole : atomJ.thole;
Mark Friedrichs's avatar
Mark Friedrichs committed
173

174
        float damp                    = ratio*ratio*ratio*pGamma;
Mark Friedrichs's avatar
Mark Friedrichs committed
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
        float dampExp                 = expf( -damp );
        float damp1                   = damp + one;
        float damp2                   = damp*damp;
        float damp3                   = damp2*damp;

        scalingFactors[Scale3Index]   = one - dampExp;
        scalingFactors[Scale5Index]   = one - damp1*dampExp;
        scalingFactors[Scale7Index]   = one - ( damp1 + 0.6f*damp2)*dampExp;
        scalingFactors[Scale9Index]   = one - ( damp1 + ( 2.0f*damp2 + damp3 )*i35)*dampExp;

        float factor                  = 3.0f*damp*dampExp*r2I;
        float factor7                 = -0.2f + 0.6f*damp;
        
        for( int ii = 0; ii < 3; ii++ ){
            scalingFactors[Ddsc30Index + ii] = factor*deltaR[ii];
            scalingFactors[Ddsc50Index + ii] = scalingFactors[Ddsc30Index + ii]*damp;
            scalingFactors[Ddsc70Index + ii] = scalingFactors[Ddsc50Index + ii]*factor7;
        }

    }
      
Peter Eastman's avatar
Peter Eastman committed
196
197
198
199
200
201
202
203
    float scaleI0 = scalingFactors[Scale3Index]*scalingFactors[UScaleIndex];
    float dsc0    = scalingFactors[Scale3Index]*scalingFactors[DScaleIndex];
    float psc0    = scalingFactors[Scale3Index]*scalingFactors[PScaleIndex];
    float scaleI1 = scalingFactors[Scale3Index+1]*scalingFactors[UScaleIndex];
    float dsc1    = scalingFactors[Scale3Index+1]*scalingFactors[DScaleIndex];
    float psc1    = scalingFactors[Scale3Index+1]*scalingFactors[PScaleIndex];
    float dsc2    = scalingFactors[Scale3Index+2]*scalingFactors[DScaleIndex];
    float psc2    = scalingFactors[Scale3Index+2]*scalingFactors[PScaleIndex];
Mark Friedrichs's avatar
Mark Friedrichs committed
204
                       
205
    float qIr[3], qJr[3];
Mark Friedrichs's avatar
Mark Friedrichs committed
206

207
208
    amatrixProductVector3( atomJ.labFrameQuadrupole,      deltaR,      qJr);
    amatrixProductVector3( atomI.labFrameQuadrupole,      deltaR,      qIr);
Mark Friedrichs's avatar
Mark Friedrichs committed
209

Peter Eastman's avatar
Peter Eastman committed
210
211
212
    float sc2     = DOT3_4(        atomI.labFrameDipole,  atomJ.labFrameDipole );
    float sc3     = DOT3_4(        atomI.labFrameDipole,  deltaR  );
    float sc4     = DOT3_4(        atomJ.labFrameDipole,  deltaR  );
Mark Friedrichs's avatar
Mark Friedrichs committed
213
    
Peter Eastman's avatar
Peter Eastman committed
214
215
    float sc5     = DOT3_4(        qIr, deltaR  );
    float sc6     = DOT3_4(        qJr, deltaR  );
Mark Friedrichs's avatar
Mark Friedrichs committed
216
    
Peter Eastman's avatar
Peter Eastman committed
217
218
    float sc7     = DOT3_4(        qIr, atomJ.labFrameDipole );
    float sc8     = DOT3_4(        qJr, atomI.labFrameDipole );
Mark Friedrichs's avatar
Mark Friedrichs committed
219
    
Peter Eastman's avatar
Peter Eastman committed
220
    float sc9     = DOT3_4(        qIr, qJr );
Mark Friedrichs's avatar
Mark Friedrichs committed
221
    
Peter Eastman's avatar
Peter Eastman committed
222
    float sc10    = MATRIXDOT31( atomI.labFrameQuadrupole, atomJ.labFrameQuadrupole );
Mark Friedrichs's avatar
Mark Friedrichs committed
223
    
Peter Eastman's avatar
Peter Eastman committed
224
225
226
227
228
    float sci1    = DOT3_4(        atomI.inducedDipole,  atomJ.labFrameDipole ) +
                    DOT3_4(        atomJ.inducedDipole,  atomI.labFrameDipole );
        
    float sci3    = DOT3_4(        atomI.inducedDipole,  deltaR  );
    float sci4    = DOT3_4(        atomJ.inducedDipole,  deltaR  );
Mark Friedrichs's avatar
Mark Friedrichs committed
229
    
Peter Eastman's avatar
Peter Eastman committed
230
231
    float sci7    = DOT3_4(        qIr, atomJ.inducedDipole );
    float sci8    = DOT3_4(        qJr, atomI.inducedDipole );
Mark Friedrichs's avatar
Mark Friedrichs committed
232
    
Peter Eastman's avatar
Peter Eastman committed
233
234
    float scip1   = DOT3_4(        atomI.inducedDipoleP, atomJ.labFrameDipole ) +
                    DOT3_4(        atomJ.inducedDipoleP, atomI.labFrameDipole );
Mark Friedrichs's avatar
Mark Friedrichs committed
235
    
Peter Eastman's avatar
Peter Eastman committed
236
237
    float scip2   = DOT3_4(        atomI.inducedDipole,  atomJ.inducedDipoleP) +
                    DOT3_4(        atomJ.inducedDipole,  atomI.inducedDipoleP);
Mark Friedrichs's avatar
Mark Friedrichs committed
238
    
Peter Eastman's avatar
Peter Eastman committed
239
240
    float scip3   = DOT3_4(        atomI.inducedDipoleP, deltaR );
    float scip4   = DOT3_4(        atomJ.inducedDipoleP, deltaR );
Mark Friedrichs's avatar
Mark Friedrichs committed
241
    
Peter Eastman's avatar
Peter Eastman committed
242
243
    float scip7   = DOT3_4(        qIr, atomJ.inducedDipoleP );
    float scip8   = DOT3_4(        qJr, atomI.inducedDipoleP );
Mark Friedrichs's avatar
Mark Friedrichs committed
244

245
246
247
248
249
250
    float scaleF             = 0.5f*scalingFactors[UScaleIndex];
    float inducedFactor3     = scip2*rr3*scaleF;
    float inducedFactor5     = (sci3*scip4+scip3*sci4)*rr5*scaleF;
    float findmp_0           = inducedFactor3*ddsc3[0] - inducedFactor5*ddsc5[0];
    float findmp_1           = inducedFactor3*ddsc3[1] - inducedFactor5*ddsc5[1];
    float findmp_2           = inducedFactor3*ddsc3[2] - inducedFactor5*ddsc5[2];
Mark Friedrichs's avatar
Mark Friedrichs committed
251

Peter Eastman's avatar
Peter Eastman committed
252
253
254
255
256
    float gli1               = atomJ.q*sci3 - atomI.q*sci4;
    float gli2               = -sc3*sci4 - sci3*sc4;
    float gli3               = sci3*sc6 - sci4*sc5;
    float gli6               = sci1;
    float gli7               = 2.0f*(sci7-sci8);
Mark Friedrichs's avatar
Mark Friedrichs committed
257
    
Peter Eastman's avatar
Peter Eastman committed
258
259
260
261
262
    float glip1              = atomJ.q*scip3 - atomI.q*scip4;
    float glip2              = -sc3*scip4 - scip3*sc4;
    float glip3              = scip3*sc6 - scip4*sc5;
    float glip6              = scip1;
    float glip7              = 2.0f*(scip7-scip8);
Mark Friedrichs's avatar
Mark Friedrichs committed
263
    
264
265
266
    float factor3            = rr3*(( gli1  +  gli6)*scalingFactors[PScaleIndex] + (glip1  + glip6)*scalingFactors[DScaleIndex]);
    float factor5            = rr5*(( gli2  +  gli7)*scalingFactors[PScaleIndex] + (glip2  + glip7)*scalingFactors[DScaleIndex]);
    float factor7            = rr7*( gli3*scalingFactors[PScaleIndex] + glip3*scalingFactors[DScaleIndex]);
Mark Friedrichs's avatar
Mark Friedrichs committed
267
      
268
269
270
    float fridmp_0           = 0.5f*(factor3*ddsc3[0] + factor5*ddsc5[0] + factor7*ddsc7[0]);
    float fridmp_1           = 0.5f*(factor3*ddsc3[1] + factor5*ddsc5[1] + factor7*ddsc7[1]);
    float fridmp_2           = 0.5f*(factor3*ddsc3[2] + factor5*ddsc5[2] + factor7*ddsc7[2]);
Mark Friedrichs's avatar
Mark Friedrichs committed
271
      
Peter Eastman's avatar
Peter Eastman committed
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
    float gl0 = atomI.q*atomJ.q;
    float gl1 = atomJ.q*sc3 - atomI.q*sc4;
    float gl2 = atomI.q*sc6 + atomJ.q*sc5 - sc3*sc4;
    float gl3 = sc3*sc6 - sc4*sc5;
    float gl4 = sc5*sc6;
    float gl6 = sc2;
    float gl7 = 2.0f*(sc7-sc8);
    float gl8 = 2.0f*sc10;
    float gl5 = -4.0f*sc9;
    
    float gf1 = rr3*gl0 + rr5*(gl1+gl6) + rr7*(gl2+gl7+gl8) + rr9*(gl3+gl5) + rr11*gl4;
    float gf2 = -atomJ.q*rr3 + sc4*rr5 - sc6*rr7;
    float gf3 =  atomI.q*rr3 + sc3*rr5 + sc5*rr7;
    float gf4 = 2.0f*rr5;
    float gf5 = 2.0f*(-atomJ.q*rr5+sc4*rr7-sc6*rr9);
    float gf6 = 2.0f*(-atomI.q*rr5-sc3*rr7-sc5*rr9);
    float gf7 = 4.0f*rr7;
Mark Friedrichs's avatar
Mark Friedrichs committed
289
290
291

    // energy

Peter Eastman's avatar
Peter Eastman committed
292
293
    float em                 = scalingFactors[MScaleIndex]*(rr1*gl0 + rr3*(gl1+gl6) + rr5*(gl2+gl7+gl8) + rr7*(gl3+gl5) + rr9*gl4);
    float ei                 = 0.5f*(rr3*(gli1+gli6)*psc0 + rr5*(gli2+gli7)*psc1 + rr7*gli3*psc2);
294
    outputForce->w           = em+ei;
Mark Friedrichs's avatar
Mark Friedrichs committed
295
296
297
298
299
    
#ifdef AMOEBA_DEBUG
#if 0
if( 1 ){
    int debugIndex           = 0;
300
301
    debugArray[debugIndex].x = em;
    debugArray[debugIndex].y = ei;
Mark Friedrichs's avatar
Mark Friedrichs committed
302
303
304
305
    debugArray[debugIndex].z = rr1;
    debugArray[debugIndex].w = rr3;

    debugIndex++;
Peter Eastman's avatar
Peter Eastman committed
306
307
308
309
    debugArray[debugIndex].x = gl0;
    debugArray[debugIndex].y = gl1;
    debugArray[debugIndex].z = gl6;
    debugArray[debugIndex].w = gl2;
Mark Friedrichs's avatar
Mark Friedrichs committed
310
311

    debugIndex++;
Peter Eastman's avatar
Peter Eastman committed
312
313
314
315
    debugArray[debugIndex].x = gli1;
    debugArray[debugIndex].y = gli3;
    debugArray[debugIndex].z = gli2;
    debugArray[debugIndex].w = gli7;
Mark Friedrichs's avatar
Mark Friedrichs committed
316
317

    debugIndex++;
Peter Eastman's avatar
Peter Eastman committed
318
319
320
    debugArray[debugIndex].x = psc0;
    debugArray[debugIndex].y = psc1;
    debugArray[debugIndex].z = psc2;
Mark Friedrichs's avatar
Mark Friedrichs committed
321
322
323
324
325
326
327
    debugArray[debugIndex].w = scalingFactors[MScaleIndex];

}
#endif
#endif

    float temp1[3],temp2[3],temp3[3];
328
329
330
    float qIqJr[3], qJqIr[3], qIdJ[3], qJdI[3];
    amatrixProductVector3( atomI.labFrameQuadrupole,      atomJ.labFrameDipole,     qIdJ );//MK
    amatrixProductVector3( atomJ.labFrameQuadrupole,      atomI.labFrameDipole,     qJdI );//MK
Mark Friedrichs's avatar
Mark Friedrichs committed
331

332
333
334
335
    amatrixProductVector3( atomI.labFrameQuadrupole,      qJr,    qIqJr );//MK
    amatrixProductVector3( atomJ.labFrameQuadrupole,      qIr,    qJqIr );//MK
    amatrixProductVector3( atomJ.labFrameQuadrupole,      qIr,    temp1 );
    amatrixProductVector3( atomJ.labFrameQuadrupole,      atomI.labFrameDipole,     temp2 );
Mark Friedrichs's avatar
Mark Friedrichs committed
336

337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
    float ftm2_0 = gf1*deltaR[0] +
                     gf2*atomI.labFrameDipole[0] + gf3*atomJ.labFrameDipole[0]  +
                     gf4*(temp2[0]  - qIdJ[0])   +
                     gf5*qIr[0]    + gf6*qJr[0]  +
                     gf7*(qIqJr[0] + temp1[0]);
    
    float ftm2_1 = gf1*deltaR[1]                 +
                     gf2*atomI.labFrameDipole[1] + gf3*atomJ.labFrameDipole[1]  +
                     gf4*(temp2[1]  - qIdJ[1])   +
                     gf5*qIr[1]    + gf6*qJr[1]  +
                     gf7*(qIqJr[1] + temp1[1]);
    
    float ftm2_2 = gf1*deltaR[2]                 +
                     gf2*atomI.labFrameDipole[2] + gf3*atomJ.labFrameDipole[2]  +
                     gf4*(temp2[2]  - qIdJ[2])   +
                     gf5*qIr[2]    + gf6*qJr[2]  +
                     gf7*(qIqJr[2] + temp1[2]);
Mark Friedrichs's avatar
Mark Friedrichs committed
354
355
356
357
358
359
    

    // get the induced force;

    // intermediate variables for the induced-permanent terms;
    
Peter Eastman's avatar
Peter Eastman committed
360
361
362
363
364
    float gfi1 = rr5*0.5f*((gli1+gli6)*psc0 + (glip1+glip6)*dsc0 + scip2*scaleI0) + rr7*((gli7+gli2)*psc1 + (glip7+glip2)*dsc1 -
                                                       (sci3*scip4+scip3*sci4)*scaleI1)*0.5f + rr9*(gli3*psc2+glip3*dsc2)*0.5f;
    float gfi4 = 2.0f*rr5;
    float gfi5 = rr7* (sci4*psc2 + scip4*dsc2);
    float gfi6 = -rr7*(sci3*psc2 + scip3*dsc2);
Mark Friedrichs's avatar
Mark Friedrichs committed
365
366
367
368
369
370
371
372
373
374
375
376
377
378


    float temp4[3];
    float temp5[3];
    float temp6[3];
    float temp7[3];
    float temp8[3];
    float temp9[3];
    float temp10[3];
    float temp11[3];
    float temp12[3];
    float temp13[3];
    float temp14[3];
    float temp15[3];
379
380
    float qIuJp[3], qJuIp[3];
    float qIuJ[3], qJuI[3];
Mark Friedrichs's avatar
Mark Friedrichs committed
381

382
    amatrixProductVector3(atomJ.labFrameQuadrupole,      atomI.inducedDipoleP,    temp4);
Mark Friedrichs's avatar
Mark Friedrichs committed
383

384
385
386
    amatrixProductVector3(atomI.labFrameQuadrupole,      atomJ.inducedDipoleP,    qIuJp);//MK
    amatrixProductVector3(atomJ.labFrameQuadrupole,      atomI.inducedDipoleP,    qJuIp);//MK
    amatrixProductVector3(atomJ.labFrameQuadrupole,      atomI.inducedDipole ,    qJuI);//MK
Mark Friedrichs's avatar
Mark Friedrichs committed
387

388
389
    amatrixProductVector3(atomJ.labFrameQuadrupole,      atomI.inducedDipole,    temp5);
    amatrixProductVector3(atomI.labFrameQuadrupole,      atomJ.inducedDipole ,     qIuJ);//MK
Mark Friedrichs's avatar
Mark Friedrichs committed
390

391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
    float ftm2i_0 = gfi1*deltaR[0] +
                    0.5f*(-rr3*atomJ.q*(atomI.inducedDipole[0]*psc0 + atomI.inducedDipoleP[0]*dsc0) +
                    rr5*sc4*(atomI.inducedDipole[0]*psc1 + atomI.inducedDipoleP[0]*dsc1) -
                    rr7*sc6*(atomI.inducedDipole[0]*psc2 + atomI.inducedDipoleP[0]*dsc2)) +
      
                   (rr3*atomI.q*(atomJ.inducedDipole[0]*psc0+atomJ.inducedDipoleP[0]*dsc0) +
                     rr5*sc3*(atomJ.inducedDipole[0]*psc1 +atomJ.inducedDipoleP[0]*dsc1) +
                     rr7*sc5*(atomJ.inducedDipole[0]*psc2 +atomJ.inducedDipoleP[0]*dsc2))*0.5f +
                     rr5*scaleI1*(sci4*atomI.inducedDipoleP[0]+scip4*atomI.inducedDipole[0] +
                     sci3*atomJ.inducedDipoleP[0]+scip3*atomJ.inducedDipole[0])*0.5f +
      
                    0.5f*(sci4*psc1+scip4*dsc1)*rr5*atomI.labFrameDipole[0] +
                    0.5f*(sci3*psc1+scip3*dsc1)*rr5*atomJ.labFrameDipole[0] +
                    0.5f*gfi4*((temp5[0]-qIuJ[0])*psc1 +
                    (temp4[0]-qIuJp[0])*dsc1) + gfi5*qIr[0] + gfi6*qJr[0];
      
    float ftm2i_1  = gfi1*deltaR[1] +
                    0.5f*(-rr3*atomJ.q*(atomI.inducedDipole[1]*psc0 + atomI.inducedDipoleP[1]*dsc0) +
                    rr5*sc4*(atomI.inducedDipole[1]*psc1 + atomI.inducedDipoleP[1]*dsc1) -
                    rr7*sc6*(atomI.inducedDipole[1]*psc2 + atomI.inducedDipoleP[1]*dsc2)) +
      
                    (rr3*atomI.q*(atomJ.inducedDipole[1]*psc0+atomJ.inducedDipoleP[1]*dsc0) +
                     rr5*sc3*(atomJ.inducedDipole[1]*psc1 +atomJ.inducedDipoleP[1]*dsc1) +
                     rr7*sc5*(atomJ.inducedDipole[1]*psc2 +atomJ.inducedDipoleP[1]*dsc2))*0.5f +
                     rr5*scaleI1*(sci4*atomI.inducedDipoleP[1]+scip4*atomI.inducedDipole[1] +
                     sci3*atomJ.inducedDipoleP[1]+scip3*atomJ.inducedDipole[1])*0.5f +
      
                    0.5f*(sci4*psc1+scip4*dsc1)*rr5*atomI.labFrameDipole[1] +
                    0.5f*(sci3*psc1+scip3*dsc1)*rr5*atomJ.labFrameDipole[1] +
                    0.5f*gfi4*((temp5[1]-qIuJ[1])*psc1 +
                    (temp4[1]-qIuJp[1])*dsc1) + gfi5*qIr[1] + gfi6*qJr[1];
      
    float ftm2i_2  = gfi1*deltaR[2] +
                    0.5f*(-rr3*atomJ.q*(atomI.inducedDipole[2]*psc0 + atomI.inducedDipoleP[2]*dsc0) +
                    rr5*sc4*(atomI.inducedDipole[2]*psc1 + atomI.inducedDipoleP[2]*dsc1) -
                    rr7*sc6*(atomI.inducedDipole[2]*psc2 + atomI.inducedDipoleP[2]*dsc2)) +
      
                    (rr3*atomI.q*(atomJ.inducedDipole[2]*psc0+atomJ.inducedDipoleP[2]*dsc0) +
                     rr5*sc3*(atomJ.inducedDipole[2]*psc1 +atomJ.inducedDipoleP[2]*dsc1) +
                     rr7*sc5*(atomJ.inducedDipole[2]*psc2 +atomJ.inducedDipoleP[2]*dsc2))*0.5f +
                     rr5*scaleI1*(sci4*atomI.inducedDipoleP[2]+scip4*atomI.inducedDipole[2] +
                     sci3*atomJ.inducedDipoleP[2]+scip3*atomJ.inducedDipole[2])*0.5f +
      
                    0.5f*(sci4*psc1+scip4*dsc1)*rr5*atomI.labFrameDipole[2] +
                    0.5f*(sci3*psc1+scip3*dsc1)*rr5*atomJ.labFrameDipole[2] +
                    0.5f*gfi4*((temp5[2]-qIuJ[2])*psc1 +
                    (temp4[2]-qIuJp[2])*dsc1) + gfi5*qIr[2] + gfi6*qJr[2];
Mark Friedrichs's avatar
Mark Friedrichs committed
438
439
440
441

    // handle of scaling for partially excluded interactions;
    // correction to convert mutual to direct polarization force;
    
442
443
444
    ftm2i_0 -= (fridmp_0 + findmp_0);
    ftm2i_1 -= (fridmp_1 + findmp_1);
    ftm2i_2 -= (fridmp_2 + findmp_2);
Mark Friedrichs's avatar
Mark Friedrichs committed
445
    
446
447
448
449
450
451
452
453
454
455
456
457
    if( cAmoebaSim.polarizationType )
    {
        float gfd     = 0.5*(rr5*scip2*scaleI0 - rr7*(scip3*sci4+sci3*scip4)*scaleI1);
        float temp5   = 0.5*rr5*scaleI1;
        float fdir_0  = gfd*deltaR[0] + temp5*(sci4*atomI.inducedDipoleP[0] + scip4*atomI.inducedDipole[0] + sci3*atomJ.inducedDipoleP[0] + scip3*atomJ.inducedDipole[0]);
        float fdir_1  = gfd*deltaR[1] + temp5*(sci4*atomI.inducedDipoleP[1] + scip4*atomI.inducedDipole[1] + sci3*atomJ.inducedDipoleP[1] + scip3*atomJ.inducedDipole[1]);
        float fdir_2  = gfd*deltaR[2] + temp5*(sci4*atomI.inducedDipoleP[2] + scip4*atomI.inducedDipole[2] + sci3*atomJ.inducedDipoleP[2] + scip3*atomJ.inducedDipole[2]);
        ftm2i_0      -= fdir_0 - findmp_0;
        ftm2i_1      -= fdir_1 - findmp_1;
        ftm2i_2      -= fdir_2 - findmp_2;

    }
Mark Friedrichs's avatar
Mark Friedrichs committed
458
459
460
    // now perform the torque calculation;
    // intermediate terms for torque between multipoles i and j;
    
Peter Eastman's avatar
Peter Eastman committed
461
462
463
464
465
    float gti2 = 0.5f*(sci4*psc1+scip4*dsc1)*rr5;
    float gti3 = 0.5f*(sci3*psc1+scip3*dsc1)*rr5;
    float gti4 = gfi4;
    float gti5 = gfi5;
    float gti6 = gfi6;
Mark Friedrichs's avatar
Mark Friedrichs committed
466
467
468

    // get the permanent (ttm2, ttm3) and induced interaction torques (ttm2i, ttm3i)
    
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
    acrossProductVector3(atomI.labFrameDipole,      atomJ.labFrameDipole,      temp1);
    acrossProductVector3(atomI.labFrameDipole,      atomJ.inducedDipole ,      temp2);
    acrossProductVector3(atomI.labFrameDipole,      atomJ.inducedDipoleP,     temp3);
    acrossProductVector3(atomI.labFrameDipole,      deltaR,       temp4);
    acrossProductVector3(deltaR,       qIuJp,   temp5);
    acrossProductVector3(deltaR,       qIr,     temp6);
    acrossProductVector3(deltaR,       qIuJ,    temp7);
    acrossProductVector3(atomJ.inducedDipole ,     qIr,     temp8);
    acrossProductVector3(atomJ.inducedDipoleP,     qIr,     temp9);
    acrossProductVector3(atomI.labFrameDipole,     qJr,     temp10);
    acrossProductVector3(atomJ.labFrameDipole,     qIr,     temp11);
    acrossProductVector3(deltaR,       qIqJr,   temp12);
    acrossProductVector3(deltaR,       qIdJ,    temp13);

    amatrixCrossProductMatrix3(atomI.labFrameQuadrupole,      atomJ.labFrameQuadrupole,      temp14);
    acrossProductVector3(qJr, qIr,     temp15);
Mark Friedrichs's avatar
Mark Friedrichs committed
485

Peter Eastman's avatar
Peter Eastman committed
486
487
488
489
490
491
    float ttm2_0  = -rr3*temp1[0] + gf2*temp4[0]-gf5*temp6[0] + gf4*(temp10[0] + temp11[0] + temp13[0]-2.0f*temp14[0]) - gf7*(temp12[0] + temp15[0]);
    float ttm2i_0 = -rr3*(temp2[0]*psc0+temp3[0]*dsc0)*0.5f + gti2*temp4[0] + gti4*((temp8[0]+ temp7[0])*psc1 + (temp9[0] + temp5[0])*dsc1)*0.5f - gti5*temp6[0];
    float ttm2_1  = -rr3*temp1[1] + gf2*temp4[1]-gf5*temp6[1] + gf4*(temp10[1] + temp11[1] + temp13[1]-2.0f*temp14[1]) - gf7*(temp12[1] + temp15[1]);
    float ttm2i_1 = -rr3*(temp2[1]*psc0+temp3[1]*dsc0)*0.5f + gti2*temp4[1] + gti4*((temp8[1]+ temp7[1])*psc1 + (temp9[1] + temp5[1])*dsc1)*0.5f - gti5*temp6[1];
    float ttm2_2  = -rr3*temp1[2] + gf2*temp4[2]-gf5*temp6[2] + gf4*(temp10[2] + temp11[2] + temp13[2]-2.0f*temp14[2]) - gf7*(temp12[2] + temp15[2]);
    float ttm2i_2 = -rr3*(temp2[2]*psc0+temp3[2]*dsc0)*0.5f + gti2*temp4[2] + gti4*((temp8[2]+ temp7[2])*psc1 + (temp9[2] + temp5[2])*dsc1)*0.5f - gti5*temp6[2];
Mark Friedrichs's avatar
Mark Friedrichs committed
492

493
494
495
496
497
498
499
500
501
502
503
504
505
506
    acrossProductVector3(atomJ.labFrameDipole,      deltaR,       temp2  );
    acrossProductVector3(deltaR,       qJr,     temp3  );
    acrossProductVector3(atomI.labFrameDipole,      qJr,     temp4  );
    acrossProductVector3(atomJ.labFrameDipole,      qIr,     temp5  );
    acrossProductVector3(deltaR,       qJdI,    temp6  );
    acrossProductVector3(deltaR,       qJqIr,   temp7  );
    acrossProductVector3(qJr,     qIr,     temp8  ); // _qJrxqIr
    acrossProductVector3(atomJ.labFrameDipole,      atomI.inducedDipole ,      temp9  ); // _dJxuI
    acrossProductVector3(atomJ.labFrameDipole,      atomI.inducedDipoleP,     temp10 ); // _dJxuIp

    acrossProductVector3(atomI.inducedDipoleP,     qJr,     temp11 ); // _uIxqJrp
    acrossProductVector3(atomI.inducedDipole ,     qJr,     temp12 ); // _uIxqJr
    acrossProductVector3(deltaR,       qJuIp,   temp13 ); // _rxqJuIp
    acrossProductVector3(deltaR,       qJuI,    temp15 ); // _rxqJuI
Mark Friedrichs's avatar
Mark Friedrichs committed
507

Peter Eastman's avatar
Peter Eastman committed
508
509
510
511
512
513
    float ttm3_0 = rr3*temp1[0] + gf3*temp2[0] - gf6*temp3[0] - gf4*(temp4[0] + temp5[0] + temp6[0] - 2.0f*temp14[0]) - gf7*(temp7[0] - temp8[0]);
    float ttm3i_0 = -rr3*(temp9[0]*psc0+ temp10[0]*dsc0)*0.5f + gti3*temp2[0] - gti4*((temp12[0] + temp15[0])*psc1 + (temp11[0] + temp13[0])*dsc1)*0.5f - gti6*temp3[0];
    float ttm3_1 = rr3*temp1[1] + gf3*temp2[1] - gf6*temp3[1] - gf4*(temp4[1] + temp5[1] + temp6[1] - 2.0f*temp14[1]) - gf7*(temp7[1] - temp8[1]);
    float ttm3i_1 = -rr3*(temp9[1]*psc0+ temp10[1]*dsc0)*0.5f + gti3*temp2[1] - gti4*((temp12[1] + temp15[1])*psc1 + (temp11[1] + temp13[1])*dsc1)*0.5f - gti6*temp3[1];
    float ttm3_2 = rr3*temp1[2] + gf3*temp2[2] - gf6*temp3[2] - gf4*(temp4[2] + temp5[2] + temp6[2] - 2.0f*temp14[2]) - gf7*(temp7[2] - temp8[2]);
    float ttm3i_2 = -rr3*(temp9[2]*psc0+ temp10[2]*dsc0)*0.5f + gti3*temp2[2] - gti4*((temp12[2] + temp15[2])*psc1 + (temp11[2] + temp13[2])*dsc1)*0.5f - gti6*temp3[2];
Mark Friedrichs's avatar
Mark Friedrichs committed
514
515
516

    if( scalingFactors[MScaleIndex] < 1.0f ){
    
517
518
519
        ftm2_0 *= scalingFactors[MScaleIndex];
        ftm2_1 *= scalingFactors[MScaleIndex];
        ftm2_2 *= scalingFactors[MScaleIndex];
Mark Friedrichs's avatar
Mark Friedrichs committed
520
        
Peter Eastman's avatar
Peter Eastman committed
521
522
523
        ttm2_0 *= scalingFactors[MScaleIndex];
        ttm2_1 *= scalingFactors[MScaleIndex];
        ttm2_2 *= scalingFactors[MScaleIndex];
Mark Friedrichs's avatar
Mark Friedrichs committed
524
        
Peter Eastman's avatar
Peter Eastman committed
525
526
527
        ttm3_0 *= scalingFactors[MScaleIndex];
        ttm3_1 *= scalingFactors[MScaleIndex];
        ttm3_2 *= scalingFactors[MScaleIndex];
Mark Friedrichs's avatar
Mark Friedrichs committed
528
529
530
531
532
533
534
    
    }


#ifdef AMOEBA_DEBUG
if( 0 ){
int debugIndex               = 0;
535
536
537
538
    debugArray[debugIndex].x = scalingFactors[DScaleIndex];
    debugArray[debugIndex].y = scalingFactors[PScaleIndex];
    debugArray[debugIndex].z = scalingFactors[MScaleIndex];
    debugArray[debugIndex].w = scalingFactors[UScaleIndex];
Mark Friedrichs's avatar
Mark Friedrichs committed
539
540

    debugIndex++;
541
542
543
544
    debugArray[debugIndex].x = ftm2i_0 + (fridmp_0 + findmp_0);
    debugArray[debugIndex].y = ftm2i_1 + (fridmp_1 + findmp_1);
    debugArray[debugIndex].z = ftm2i_2 + (fridmp_2 + findmp_2);
    debugArray[debugIndex].w = 1.5;
Mark Friedrichs's avatar
Mark Friedrichs committed
545

546
/*
Mark Friedrichs's avatar
Mark Friedrichs committed
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
    debugIndex++;
    debugArray[debugIndex].x = temp2[0];
    debugArray[debugIndex].y = temp2[1];
    debugArray[debugIndex].z = temp2[2];
    debugArray[debugIndex].w = 2.0f;

    debugIndex++;
    debugArray[debugIndex].x = temp3[0];
    debugArray[debugIndex].y = temp3[1];
    debugArray[debugIndex].z = temp3[2];
    debugArray[debugIndex].w = 3.0f;

    debugIndex++;
    debugArray[debugIndex].x = temp4[0];
    debugArray[debugIndex].y = temp4[1];
    debugArray[debugIndex].z = temp4[2];
    debugArray[debugIndex].w = 4.0f;

    debugIndex++;
    debugArray[debugIndex].x = temp5[0];
    debugArray[debugIndex].y = temp5[1];
    debugArray[debugIndex].z = temp5[2];
    debugArray[debugIndex].w = 5.0f;

    debugIndex++;
    debugArray[debugIndex].x = temp6[0];
    debugArray[debugIndex].y = temp6[1];
    debugArray[debugIndex].z = temp6[2];
    debugArray[debugIndex].w = 6.0f;

    debugIndex++;
    debugArray[debugIndex].x = temp14[0];
    debugArray[debugIndex].y = temp14[1];
    debugArray[debugIndex].z = temp14[2];
    debugArray[debugIndex].w = 14.0f;

    debugIndex++;
    debugArray[debugIndex].x = temp7[0];
    debugArray[debugIndex].y = temp7[1];
    debugArray[debugIndex].z = temp7[2];
    debugArray[debugIndex].w = 7.0f;


    debugIndex++;
    debugArray[debugIndex].x = temp8[0];
    debugArray[debugIndex].y = temp8[1];
    debugArray[debugIndex].z = temp8[2];
    debugArray[debugIndex].w = 8.0f;

    debugIndex++;
    debugArray[debugIndex].x = rr3;
Peter Eastman's avatar
Peter Eastman committed
598
599
    debugArray[debugIndex].y = gf3;
    debugArray[debugIndex].z = gf6;
Mark Friedrichs's avatar
Mark Friedrichs committed
600
601
602
    debugArray[debugIndex].w = 20.0f;

    debugIndex++;
Peter Eastman's avatar
Peter Eastman committed
603
604
    debugArray[debugIndex].x = gf4;
    debugArray[debugIndex].y = gf7;
Mark Friedrichs's avatar
Mark Friedrichs committed
605
606
607
608
    debugArray[debugIndex].z = 0.0f;
    debugArray[debugIndex].w = 21.0f;

    debugIndex++;
609
610
611
    debugArray[debugIndex].x = atomJ.labFrameDipole[0];
    debugArray[debugIndex].y = atomJ.labFrameDipole[1];
    debugArray[debugIndex].z = atomJ.labFrameDipole[2];
Mark Friedrichs's avatar
Mark Friedrichs committed
612
613
614
    debugArray[debugIndex].w = 22.0f;

    debugIndex++;
615
616
617
    debugArray[debugIndex].x = deltaR[0];
    debugArray[debugIndex].y = deltaR[1];
    debugArray[debugIndex].z = deltaR[2];
Mark Friedrichs's avatar
Mark Friedrichs committed
618
    debugArray[debugIndex].w = 23.0f;
619
*/
Mark Friedrichs's avatar
Mark Friedrichs committed
620
621
622
623

}
#endif

624
625
626
    outputForce->x       = -(ftm2_0 + ftm2i_0);
    outputForce->y       = -(ftm2_1 + ftm2i_1);
    outputForce->z       = -(ftm2_2 + ftm2i_2);
Mark Friedrichs's avatar
Mark Friedrichs committed
627
    
628
629
630
    outputTorque[0].x    =  (ttm2_0 + ttm2i_0);
    outputTorque[0].y    =  (ttm2_1 + ttm2i_1);
    outputTorque[0].z    =  (ttm2_2 + ttm2i_2);
Mark Friedrichs's avatar
Mark Friedrichs committed
631

632
633
634
    outputTorque[1].x    =  (ttm3_0 + ttm3i_0);
    outputTorque[1].y    =  (ttm3_1 + ttm3i_1);
    outputTorque[1].z    =  (ttm3_2 + ttm3i_2);
Mark Friedrichs's avatar
Mark Friedrichs committed
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652

    return;

}

/**
 * Copy the per-atom electrostatic data of one atom from the flat input arrays
 * into an ElectrostaticParticle.
 *
 * @param sA                     destination particle (presumably a shared-memory slot,
 *                               going by the function name -- confirm at the call site)
 * @param atomI                  index of the atom to load
 * @param atomCoord              per-atom float4; .x/.y/.z are copied to the particle
 *                               position and .w to the charge field q
 * @param labFrameDipoleJ        lab-frame dipoles, read at atomI*3 .. atomI*3+2
 * @param labQuadrupole          lab-frame quadrupoles, read at atomI*9 .. atomI*9+8
 * @param inducedDipole          induced dipoles, read at atomI*3 .. atomI*3+2
 * @param inducedDipolePolar     induced polar dipoles, read at atomI*3 .. atomI*3+2
 * @param dampingFactorAndThole  per-atom float2; .x -> damp, .y -> thole
 */
__device__ void loadElectrostaticShared( struct ElectrostaticParticle* sA, unsigned int atomI,
                                         float4* atomCoord, float* labFrameDipoleJ, float* labQuadrupole,
                                         float* inducedDipole, float* inducedDipolePolar, float2* dampingFactorAndThole )
{
    // base offsets of this atom in the 3-component and 9-component arrays

    const unsigned int vectorBase = atomI*3;
    const unsigned int tensorBase = atomI*9;

    // coordinates & charge

    const float4 positionAndCharge = atomCoord[atomI];
    sA->x                          = positionAndCharge.x;
    sA->y                          = positionAndCharge.y;
    sA->z                          = positionAndCharge.z;
    sA->q                          = positionAndCharge.w;

    // lab dipole, induced dipole and induced dipole polar (3 components each)

#pragma unroll
    for( unsigned int component = 0; component < 3; component++ ){
        sA->labFrameDipole[component] = labFrameDipoleJ[vectorBase + component];
        sA->inducedDipole[component]  = inducedDipole[vectorBase + component];
        sA->inducedDipoleP[component] = inducedDipolePolar[vectorBase + component];
    }

    // lab quadrupole (all 9 entries)

#pragma unroll
    for( unsigned int entry = 0; entry < 9; entry++ ){
        sA->labFrameQuadrupole[entry] = labQuadrupole[tensorBase + entry];
    }

    // damping factor & Thole parameter

    const float2 dampAndThole = dampingFactorAndThole[atomI];
    sA->damp                  = dampAndThole.x;
    sA->thole                 = dampAndThole.y;

}

// Include versions of the kernels for N^2 calculations.
// The kernel header is included twice, configured each time through two macros:
//   METHOD_NAME(a, b)           token-pastes a tag between a and b, so that each inclusion
//                               can produce distinctly named symbols
//   USE_OUTPUT_BUFFER_PER_WARP  left undefined for the first inclusion, defined for the second
// The order of the directives below matters: each macro must be in its intended state
// at the point of the #include that follows it.
// NOTE(review): the header is not visible from this file; presumably it expands
// METHOD_NAME(kCalculateAmoebaCudaElectrostatic, Forces_kernel), which would yield the
// ...N2Forces_kernel and ...N2ByWarpForces_kernel kernels launched later in this file -- confirm.

// first inclusion: tag "N2", USE_OUTPUT_BUFFER_PER_WARP not defined
#undef USE_OUTPUT_BUFFER_PER_WARP
#define METHOD_NAME(a, b) a##N2##b
#include "kCalculateAmoebaCudaElectrostatic.h"
// second inclusion: tag "N2ByWarp", USE_OUTPUT_BUFFER_PER_WARP defined
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##N2ByWarp##b
#include "kCalculateAmoebaCudaElectrostatic.h"

696
/**
 * Reduce psWorkArray_3_1 -> torque.
 *
 * Launches kReduceFields_kernel over paddedNumberOfAtoms*3 values, reading from
 * amoebaGpu->psWorkArray_3_1 and writing to amoebaGpu->psTorque. The launch uses
 * sim.nonbond_blocks blocks of sim.bsf_reduce_threads_per_block threads.
 *
 * @param amoebaGpu  amoebaGpu context
 */
static void kReduceTorque(amoebaGpuContext amoebaGpu )
{
    const cudaGmxSimulation& sim = amoebaGpu->gpuContext->sim;

    // NOTE(review): the meaning of the trailing 0 argument is defined by
    // kReduceFields_kernel, which is not visible here -- confirm before changing it.
    kReduceFields_kernel<<<sim.nonbond_blocks, sim.bsf_reduce_threads_per_block>>>(
                               sim.paddedNumberOfAtoms*3,
                               sim.outputBuffers,
                               amoebaGpu->psWorkArray_3_1->_pDevData,
                               amoebaGpu->psTorque->_pDevData,
                               0 );
    LAUNCHERROR("kReduceElectrostaticTorque");
}

/**---------------------------------------------------------------------------------------

   Compute Amoeba electrostatic force & torque

   Clears the work fields, launches the N^2 electrostatic kernel (the ByWarp variant
   when gpu->bOutputBufferPerWarp is set) writing into psWorkArray_3_1, and optionally
   reduces that array into psTorque and maps the torque onto the forces.

   @param amoebaGpu         amoebaGpu context
   @param addTorqueToForce  if nonzero, reduce psWorkArray_3_1 into psTorque and call
                            cudaComputeAmoebaMapTorqueAndAddToForce(); if zero, the
                            work array is left unreduced

   --------------------------------------------------------------------------------------- */

void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu, int addTorqueToForce )
{
  
   // ---------------------------------------------------------------------------------------

#ifdef AMOEBA_DEBUG
    static const char* methodName = "cudaComputeAmoebaElectrostatic";

    // identifiers (call counter, id) available for naming debug dump files
    static int timestep = 0;
    std::vector<int> fileId;
    timestep++;
    fileId.resize( 2 );
    fileId[0] = timestep;
    fileId[1] = 1;
#endif

    // ---------------------------------------------------------------------------------------

    gpuContext gpu = amoebaGpu->gpuContext;

    // apparently debug array can take up nontrivial no. registers

#ifdef AMOEBA_DEBUG
    if( amoebaGpu->log ){
        (void) fprintf( amoebaGpu->log, "%s %d maxCovalentDegreeSz=%d ZZZ\n",
                        methodName, gpu->natoms, amoebaGpu->maxCovalentDegreeSz );
    }   
    static const int maxSlots                 = 20;
    int paddedNumberOfAtoms                   = gpu->sim.paddedNumberOfAtoms;

    // zero-filled scratch buffer handed to the kernel; released at the end of this function
    CUDAStream<float4>* debugArray            = new CUDAStream<float4>(maxSlots*paddedNumberOfAtoms, 1, "DebugArray");
    memset( debugArray->_pSysData,      0, sizeof( float )*4*maxSlots*paddedNumberOfAtoms);
    debugArray->Upload();
    unsigned int targetAtom                   = 237;
#endif

    // on first pass, set threads/block
    // (cached in a function-local static, so the value chosen on the first call is
    //  reused by every later call)

    static unsigned int threadsPerBlock = 0;
    if( threadsPerBlock == 0 ){
        unsigned int maxThreads;
        if (gpu->sm_version >= SM_20)
            maxThreads = 384;
        else if (gpu->sm_version >= SM_12)
            maxThreads = 128;
        else
            maxThreads = 64;
        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(ElectrostaticParticle)), maxThreads);
    }

    // dynamic shared memory requested per block: one ElectrostaticParticle per thread

    const size_t sharedBytes = sizeof(ElectrostaticParticle)*threadsPerBlock;

    kClearFields_3( amoebaGpu, 1 );

#ifdef AMOEBA_DEBUG
    if( amoebaGpu->log ){
        // size_t values are cast explicitly so they match the %lu conversions on
        // platforms where size_t is not unsigned long
        (void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaElectrostaticN2Forces warp:  numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u\n",
                        gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
                        (unsigned long) sizeof(ElectrostaticParticle), (unsigned long) sharedBytes,
                        (unsigned long) (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
        (void) fflush( amoebaGpu->log );
    }
#endif

    if (gpu->bOutputBufferPerWarp){

        kCalculateAmoebaCudaElectrostaticN2ByWarpForces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sharedBytes>>>(
                                                                           amoebaGpu->psWorkUnit->_pDevData,
                                                                           gpu->psPosq4->_pDevData,
                                                                           amoebaGpu->psLabFrameDipole->_pDevData,
                                                                           amoebaGpu->psLabFrameQuadrupole->_pDevData,
                                                                           amoebaGpu->psInducedDipole->_pDevData,
                                                                           amoebaGpu->psInducedDipolePolar->_pDevData,
#ifdef AMOEBA_DEBUG
                                                                           amoebaGpu->psWorkArray_3_1->_pDevData,
                                                                           debugArray->_pDevData, targetAtom );
#else
                                                                           amoebaGpu->psWorkArray_3_1->_pDevData );
#endif

    } else {

        kCalculateAmoebaCudaElectrostaticN2Forces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sharedBytes>>>(
                                                                           amoebaGpu->psWorkUnit->_pDevData,
                                                                           gpu->psPosq4->_pDevData,
                                                                           amoebaGpu->psLabFrameDipole->_pDevData,
                                                                           amoebaGpu->psLabFrameQuadrupole->_pDevData,
                                                                           amoebaGpu->psInducedDipole->_pDevData,
                                                                           amoebaGpu->psInducedDipolePolar->_pDevData,
#ifdef AMOEBA_DEBUG
                                                                           amoebaGpu->psWorkArray_3_1->_pDevData,
                                                                           debugArray->_pDevData, targetAtom );
#else
                                                                           amoebaGpu->psWorkArray_3_1->_pDevData );
#endif
    }
    LAUNCHERROR("kCalculateAmoebaCudaElectrostaticN2Forces");

    // reduce the work array into psTorque and map the torque onto the forces

    if( addTorqueToForce ){
        kReduceTorque( amoebaGpu );
        cudaComputeAmoebaMapTorqueAndAddToForce( amoebaGpu, amoebaGpu->psTorque );
    }

    // disabled diagnostic dump of the torque array; change the condition to enable

    if( 0 ){
        std::vector<int> fileId;
        //fileId.push_back( 0 );
        VectorOfDoubleVectors outputVector;
        //cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,            outputVector, NULL, 1.0f );
        //cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psForce,      outputVector, NULL, 1.0f/4.184 );
        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psTorque,     outputVector, NULL, 1.0f/4.184 );
        cudaWriteVectorOfDoubleVectorsToFile( "CudaForceTorque", fileId, outputVector );
     }

#ifdef AMOEBA_DEBUG
    // release the debug buffer allocated above; without this every call leaked it
    delete debugArray;
#endif

   // ---------------------------------------------------------------------------------------
}