kCalculateAmoebaCudaElectrostatic.cu 41.9 KB
Newer Older
Mark Friedrichs's avatar
Mark Friedrichs committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
//-----------------------------------------------------------------------------------------

//-----------------------------------------------------------------------------------------

#include "amoebaGpuTypes.h"
#include "amoebaCudaKernels.h"
#include "kCalculateAmoebaCudaUtilities.h"

//#define AMOEBA_DEBUG

static __constant__ cudaGmxSimulation cSim;
static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;

void SetCalculateAmoebaElectrostaticSim(amoebaGpuContext amoebaGpu)
{
    cudaError_t status;
    gpuContext gpu = amoebaGpu->gpuContext;
    status         = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));    
    RTERROR(status, "SetCalculateAmoebaElectrostaticSim: cudaMemcpyToSymbol: SetSim copy to cSim failed");
    status         = cudaMemcpyToSymbol(cAmoebaSim, &amoebaGpu->amoebaSim, sizeof(cudaAmoebaGmxSimulation));    
    RTERROR(status, "SetCalculateAmoebaElectrostaticSim: cudaMemcpyToSymbol: SetSim copy to cAmoebaSim failed");
}

void GetCalculateAmoebaElectrostaticSim(amoebaGpuContext amoebaGpu)
{
    cudaError_t status;
    gpuContext gpu = amoebaGpu->gpuContext;
    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));    
    RTERROR(status, "GetCalculateAmoebaElectrostaticSim: cudaMemcpyFromSymbol: SetSim copy from cSim failed");
    status = cudaMemcpyFromSymbol(&amoebaGpu->amoebaSim, cAmoebaSim, sizeof(cudaAmoebaGmxSimulation));    
    RTERROR(status, "GetCalculateAmoebaElectrostaticSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed");
}

static int const PScaleIndex            =  0; 
static int const DScaleIndex            =  1; 
static int const UScaleIndex            =  2; 
static int const MScaleIndex            =  3;
static int const Scale3Index            =  4;
static int const Scale5Index            =  5;
static int const Scale7Index            =  6;
static int const Scale9Index            =  7;
static int const Ddsc30Index            =  8;
//static int const Ddsc31Index            =  9;
//static int const Ddsc32Index            = 10; 
static int const Ddsc50Index            = 11;
//static int const Ddsc51Index            = 12;
//static int const Ddsc52Index            = 13; 
static int const Ddsc70Index            = 14;
//static int const Ddsc71Index            = 15;
//static int const Ddsc72Index            = 16;
51
52
53
54
55
56
57
static int const LastScalingIndex       = 17;

#define DOT3_4(u,v) ((u[0])*(v[0]) + (u[1])*(v[1]) + (u[2])*(v[2]))

#define MATRIXDOT31(u,v) u[0]*v[0] + u[1]*v[1] + u[2]*v[2] + \
  u[3]*v[3] + u[4]*v[4] + u[5]*v[5] + \
  u[6]*v[6] + u[7]*v[7] + u[8]*v[8]
Mark Friedrichs's avatar
Mark Friedrichs committed
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94

#define DOT31(u,v) ((u[0])*(v[0]) + (u[1])*(v[1]) + (u[2])*(v[2]))

#define i35 0.257142857f
#define one 1.0f

__device__ void acrossProductVector3(   float* vectorX, float* vectorY, float* vectorZ ){
    vectorZ[0]  = vectorX[1]*vectorY[2] - vectorX[2]*vectorY[1];
    vectorZ[1]  = vectorX[2]*vectorY[0] - vectorX[0]*vectorY[2];
    vectorZ[2]  = vectorX[0]*vectorY[1] - vectorX[1]*vectorY[0];
}

__device__ void amatrixProductVector3(   float* matrixX, float* vectorY, float* vectorZ ){
    vectorZ[0]  = matrixX[0]*vectorY[0] + matrixX[3]*vectorY[1] + matrixX[6]*vectorY[2];
    vectorZ[1]  = matrixX[1]*vectorY[0] + matrixX[4]*vectorY[1] + matrixX[7]*vectorY[2];
    vectorZ[2]  = matrixX[2]*vectorY[0] + matrixX[5]*vectorY[1] + matrixX[8]*vectorY[2];
}

__device__ void amatrixCrossProductMatrix3( float* matrixX, float* matrixY, float* vectorZ ){
  
    float* xPtr[3];
    float* yPtr[3];
        
    xPtr[0]    = matrixX;
    xPtr[1]    = matrixX + 3;
    xPtr[2]    = matrixX + 6;
    
    yPtr[0]    = matrixY;
    yPtr[1]    = matrixY + 3;
    yPtr[2]    = matrixY + 6;
          
    vectorZ[0] = DOT31( xPtr[1], yPtr[2] ) - DOT31( xPtr[2], yPtr[1] );
    vectorZ[1] = DOT31( xPtr[2], yPtr[0] ) - DOT31( xPtr[0], yPtr[2] );
    vectorZ[2] = DOT31( xPtr[0], yPtr[1] ) - DOT31( xPtr[1], yPtr[0] );
  
}

95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
struct ElectrostaticParticle {

    // coordinates charge

    float x;
    float y;
    float z;
    float q;

    // lab frame dipole

    float labFrameDipole[3];

    // lab frame quadrupole

    float labFrameQuadrupole[9];

    // induced dipole

    float inducedDipole[3];

    // polar induced dipole

    float inducedDipoleP[3];

    // scaling factors

    float thole;
    float damp;

    float force[3];

    float torque[3];
    float padding;

};

__device__ void calculateElectrostaticPairIxn_kernel( ElectrostaticParticle& atomI,   ElectrostaticParticle& atomJ,
Mark Friedrichs's avatar
Mark Friedrichs committed
133
134
135
136
137
138
139
140
                                                      float scalingDistanceCutoff,   float* scalingFactors,
                                                      float*  outputForce,           float  outputTorque[2][3],
                                                      float* energy
#ifdef AMOEBA_DEBUG
                                                      ,float4* debugArray 
#endif
 ){
  
141
    float deltaR[3];
Mark Friedrichs's avatar
Mark Friedrichs committed
142
143
144
145
146
147
148
149
150
    
    // ---------------------------------------------------------------------------------------
    
    // ---------------------------------------------------------------------------------------

    float* ddsc3                    =  scalingFactors + Ddsc30Index;
    float* ddsc5                    =  scalingFactors + Ddsc50Index;
    float* ddsc7                    =  scalingFactors + Ddsc70Index;

151
152
153
    deltaR[0]                       = atomJ.x - atomI.x;
    deltaR[1]                       = atomJ.y - atomI.y;
    deltaR[2]                       = atomJ.z - atomI.z;
Mark Friedrichs's avatar
Mark Friedrichs committed
154
155
156
157
158
159
160
161
162
163
164
165
166

    float r2                        = DOT31( deltaR, deltaR );
    float r                         = sqrtf( r2 );
    float rr1                       = 1.0f/r;
    float rr2                       = rr1*rr1;
    float rr3                       = rr1*rr2;
    float rr5                       = 3.0f*rr3*rr2;
    float rr7                       = 5.0f*rr5*rr2;
    float rr9                       = 7.0f*rr7*rr2;
    float rr11                      = 9.0f*rr9*rr2;

    //-------------------------------------------

167
    if( atomI.damp != 0.0f && atomJ.damp != 0.0 && r < scalingDistanceCutoff ){
Mark Friedrichs's avatar
Mark Friedrichs committed
168
169
170
171
172
   
        float distanceIJ, r2I;
        distanceIJ                    = r;
        r2I                           = rr2;
        
173
174
        float ratio                   = distanceIJ/(atomI.damp*atomJ.damp);
        float pGamma                  = atomJ.thole > atomI.thole ? atomI.thole : atomJ.thole;
Mark Friedrichs's avatar
Mark Friedrichs committed
175

176
        float damp                          = ratio*ratio*ratio*pGamma;
Mark Friedrichs's avatar
Mark Friedrichs committed
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
        float dampExp                 = expf( -damp );
        float damp1                   = damp + one;
        float damp2                   = damp*damp;
        float damp3                   = damp2*damp;

        scalingFactors[Scale3Index]   = one - dampExp;
        scalingFactors[Scale5Index]   = one - damp1*dampExp;
        scalingFactors[Scale7Index]   = one - ( damp1 + 0.6f*damp2)*dampExp;
        scalingFactors[Scale9Index]   = one - ( damp1 + ( 2.0f*damp2 + damp3 )*i35)*dampExp;

        float factor                  = 3.0f*damp*dampExp*r2I;
        float factor7                 = -0.2f + 0.6f*damp;
        
        for( int ii = 0; ii < 3; ii++ ){
            scalingFactors[Ddsc30Index + ii] = factor*deltaR[ii];
            scalingFactors[Ddsc50Index + ii] = scalingFactors[Ddsc30Index + ii]*damp;
            scalingFactors[Ddsc70Index + ii] = scalingFactors[Ddsc50Index + ii]*factor7;
        }

    }
      
Peter Eastman's avatar
Peter Eastman committed
198
199
200
201
202
203
204
205
    float scaleI0 = scalingFactors[Scale3Index]*scalingFactors[UScaleIndex];
    float dsc0    = scalingFactors[Scale3Index]*scalingFactors[DScaleIndex];
    float psc0    = scalingFactors[Scale3Index]*scalingFactors[PScaleIndex];
    float scaleI1 = scalingFactors[Scale3Index+1]*scalingFactors[UScaleIndex];
    float dsc1    = scalingFactors[Scale3Index+1]*scalingFactors[DScaleIndex];
    float psc1    = scalingFactors[Scale3Index+1]*scalingFactors[PScaleIndex];
    float dsc2    = scalingFactors[Scale3Index+2]*scalingFactors[DScaleIndex];
    float psc2    = scalingFactors[Scale3Index+2]*scalingFactors[PScaleIndex];
Mark Friedrichs's avatar
Mark Friedrichs committed
206
                       
207
    float qIr[3], qJr[3];
Mark Friedrichs's avatar
Mark Friedrichs committed
208

209
210
    amatrixProductVector3( atomJ.labFrameQuadrupole,      deltaR,      qJr);
    amatrixProductVector3( atomI.labFrameQuadrupole,      deltaR,      qIr);
Mark Friedrichs's avatar
Mark Friedrichs committed
211

Peter Eastman's avatar
Peter Eastman committed
212
213
214
    float sc2     = DOT3_4(        atomI.labFrameDipole,  atomJ.labFrameDipole );
    float sc3     = DOT3_4(        atomI.labFrameDipole,  deltaR  );
    float sc4     = DOT3_4(        atomJ.labFrameDipole,  deltaR  );
Mark Friedrichs's avatar
Mark Friedrichs committed
215
    
Peter Eastman's avatar
Peter Eastman committed
216
217
    float sc5     = DOT3_4(        qIr, deltaR  );
    float sc6     = DOT3_4(        qJr, deltaR  );
Mark Friedrichs's avatar
Mark Friedrichs committed
218
    
Peter Eastman's avatar
Peter Eastman committed
219
220
    float sc7     = DOT3_4(        qIr, atomJ.labFrameDipole );
    float sc8     = DOT3_4(        qJr, atomI.labFrameDipole );
Mark Friedrichs's avatar
Mark Friedrichs committed
221
    
Peter Eastman's avatar
Peter Eastman committed
222
    float sc9     = DOT3_4(        qIr, qJr );
Mark Friedrichs's avatar
Mark Friedrichs committed
223
    
Peter Eastman's avatar
Peter Eastman committed
224
    float sc10    = MATRIXDOT31( atomI.labFrameQuadrupole, atomJ.labFrameQuadrupole );
Mark Friedrichs's avatar
Mark Friedrichs committed
225
    
Peter Eastman's avatar
Peter Eastman committed
226
227
228
229
230
    float sci1    = DOT3_4(        atomI.inducedDipole,  atomJ.labFrameDipole ) +
                    DOT3_4(        atomJ.inducedDipole,  atomI.labFrameDipole );
        
    float sci3    = DOT3_4(        atomI.inducedDipole,  deltaR  );
    float sci4    = DOT3_4(        atomJ.inducedDipole,  deltaR  );
Mark Friedrichs's avatar
Mark Friedrichs committed
231
    
Peter Eastman's avatar
Peter Eastman committed
232
233
    float sci7    = DOT3_4(        qIr, atomJ.inducedDipole );
    float sci8    = DOT3_4(        qJr, atomI.inducedDipole );
Mark Friedrichs's avatar
Mark Friedrichs committed
234
    
Peter Eastman's avatar
Peter Eastman committed
235
236
    float scip1   = DOT3_4(        atomI.inducedDipoleP, atomJ.labFrameDipole ) +
                    DOT3_4(        atomJ.inducedDipoleP, atomI.labFrameDipole );
Mark Friedrichs's avatar
Mark Friedrichs committed
237
    
Peter Eastman's avatar
Peter Eastman committed
238
239
    float scip2   = DOT3_4(        atomI.inducedDipole,  atomJ.inducedDipoleP) +
                    DOT3_4(        atomJ.inducedDipole,  atomI.inducedDipoleP);
Mark Friedrichs's avatar
Mark Friedrichs committed
240
    
Peter Eastman's avatar
Peter Eastman committed
241
242
    float scip3   = DOT3_4(        atomI.inducedDipoleP, deltaR );
    float scip4   = DOT3_4(        atomJ.inducedDipoleP, deltaR );
Mark Friedrichs's avatar
Mark Friedrichs committed
243
    
Peter Eastman's avatar
Peter Eastman committed
244
245
    float scip7   = DOT3_4(        qIr, atomJ.inducedDipoleP );
    float scip8   = DOT3_4(        qJr, atomI.inducedDipoleP );
Mark Friedrichs's avatar
Mark Friedrichs committed
246
247
248

    float findmp[3];
    float scaleF         = 0.5f*scalingFactors[UScaleIndex];
Peter Eastman's avatar
Peter Eastman committed
249
250
    float inducedFactor3 = scip2*rr3*scaleF;
    float inducedFactor5 = (sci3*scip4+scip3*sci4)*rr5*scaleF;
Mark Friedrichs's avatar
Mark Friedrichs committed
251
252
253
254
    findmp[0]            = inducedFactor3*ddsc3[0] - inducedFactor5*ddsc5[0];
    findmp[1]            = inducedFactor3*ddsc3[1] - inducedFactor5*ddsc5[1];
    findmp[2]            = inducedFactor3*ddsc3[2] - inducedFactor5*ddsc5[2];

Peter Eastman's avatar
Peter Eastman committed
255
256
257
258
259
    float gli1               = atomJ.q*sci3 - atomI.q*sci4;
    float gli2               = -sc3*sci4 - sci3*sc4;
    float gli3               = sci3*sc6 - sci4*sc5;
    float gli6               = sci1;
    float gli7               = 2.0f*(sci7-sci8);
Mark Friedrichs's avatar
Mark Friedrichs committed
260
    
Peter Eastman's avatar
Peter Eastman committed
261
262
263
264
265
    float glip1              = atomJ.q*scip3 - atomI.q*scip4;
    float glip2              = -sc3*scip4 - scip3*sc4;
    float glip3              = scip3*sc6 - scip4*sc5;
    float glip6              = scip1;
    float glip7              = 2.0f*(scip7-scip8);
Mark Friedrichs's avatar
Mark Friedrichs committed
266
267
268
269
270
    
    float fridmp[3];
    float factor3, factor5, factor7;
    
    if( scalingFactors[PScaleIndex] == 1.0f && scalingFactors[PScaleIndex] == 1.0f ){
Peter Eastman's avatar
Peter Eastman committed
271
272
273
        factor3 = rr3*( gli1  +  gli6  + glip1  + glip6 );
        factor5 = rr5*( gli2  +  gli7  + glip2  + glip7 );
        factor7 = rr7*( gli3  + glip3 );
Mark Friedrichs's avatar
Mark Friedrichs committed
274
    } else {
Peter Eastman's avatar
Peter Eastman committed
275
276
        factor3 = rr3*(( gli1  +  gli6)*scalingFactors[PScaleIndex] +
                       (glip1  + glip6)*scalingFactors[DScaleIndex]);
Mark Friedrichs's avatar
Mark Friedrichs committed
277
   
Peter Eastman's avatar
Peter Eastman committed
278
279
       factor5 = rr5*(( gli2  +  gli7)*scalingFactors[PScaleIndex] +
                      (glip2  + glip7)*scalingFactors[DScaleIndex]);
Mark Friedrichs's avatar
Mark Friedrichs committed
280
   
Peter Eastman's avatar
Peter Eastman committed
281
       factor7 = rr7*( gli3*scalingFactors[PScaleIndex] + glip3*scalingFactors[DScaleIndex]);
Mark Friedrichs's avatar
Mark Friedrichs committed
282
283
284
285
286
287
    }
      
    fridmp[0] = 0.5f*(factor3*ddsc3[0] + factor5*ddsc5[0] + factor7*ddsc7[0]);
    fridmp[1] = 0.5f*(factor3*ddsc3[1] + factor5*ddsc5[1] + factor7*ddsc7[1]);
    fridmp[2] = 0.5f*(factor3*ddsc3[2] + factor5*ddsc5[2] + factor7*ddsc7[2]);
      
Peter Eastman's avatar
Peter Eastman committed
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
    float gl0 = atomI.q*atomJ.q;
    float gl1 = atomJ.q*sc3 - atomI.q*sc4;
    float gl2 = atomI.q*sc6 + atomJ.q*sc5 - sc3*sc4;
    float gl3 = sc3*sc6 - sc4*sc5;
    float gl4 = sc5*sc6;
    float gl6 = sc2;
    float gl7 = 2.0f*(sc7-sc8);
    float gl8 = 2.0f*sc10;
    float gl5 = -4.0f*sc9;
    
    float gf1 = rr3*gl0 + rr5*(gl1+gl6) + rr7*(gl2+gl7+gl8) + rr9*(gl3+gl5) + rr11*gl4;
    float gf2 = -atomJ.q*rr3 + sc4*rr5 - sc6*rr7;
    float gf3 =  atomI.q*rr3 + sc3*rr5 + sc5*rr7;
    float gf4 = 2.0f*rr5;
    float gf5 = 2.0f*(-atomJ.q*rr5+sc4*rr7-sc6*rr9);
    float gf6 = 2.0f*(-atomI.q*rr5-sc3*rr7-sc5*rr9);
    float gf7 = 4.0f*rr7;
Mark Friedrichs's avatar
Mark Friedrichs committed
305
306
307
308

    // energy

    float conversionFactor   = (cAmoebaSim.electric/cAmoebaSim.dielec);
Peter Eastman's avatar
Peter Eastman committed
309
310
    float em                 = scalingFactors[MScaleIndex]*(rr1*gl0 + rr3*(gl1+gl6) + rr5*(gl2+gl7+gl8) + rr7*(gl3+gl5) + rr9*gl4);
    float ei                 = 0.5f*(rr3*(gli1+gli6)*psc0 + rr5*(gli2+gli7)*psc1 + rr7*gli3*psc2);
Mark Friedrichs's avatar
Mark Friedrichs committed
311
312
313
314
315
316
317
318
319
320
321
322
    *energy                  = conversionFactor*(em+ei);
    
#ifdef AMOEBA_DEBUG
#if 0
if( 1 ){
    int debugIndex           = 0;
    debugArray[debugIndex].x = conversionFactor*em;
    debugArray[debugIndex].y = conversionFactor*ei;
    debugArray[debugIndex].z = rr1;
    debugArray[debugIndex].w = rr3;

    debugIndex++;
Peter Eastman's avatar
Peter Eastman committed
323
324
325
326
    debugArray[debugIndex].x = gl0;
    debugArray[debugIndex].y = gl1;
    debugArray[debugIndex].z = gl6;
    debugArray[debugIndex].w = gl2;
Mark Friedrichs's avatar
Mark Friedrichs committed
327
328

    debugIndex++;
Peter Eastman's avatar
Peter Eastman committed
329
330
331
332
    debugArray[debugIndex].x = gli1;
    debugArray[debugIndex].y = gli3;
    debugArray[debugIndex].z = gli2;
    debugArray[debugIndex].w = gli7;
Mark Friedrichs's avatar
Mark Friedrichs committed
333
334

    debugIndex++;
Peter Eastman's avatar
Peter Eastman committed
335
336
337
    debugArray[debugIndex].x = psc0;
    debugArray[debugIndex].y = psc1;
    debugArray[debugIndex].z = psc2;
Mark Friedrichs's avatar
Mark Friedrichs committed
338
339
340
341
342
343
344
345
    debugArray[debugIndex].w = scalingFactors[MScaleIndex];

}
#endif
#endif

    float ftm2[3];
    float temp1[3],temp2[3],temp3[3];
346
347
348
    float qIqJr[3], qJqIr[3], qIdJ[3], qJdI[3];
    amatrixProductVector3( atomI.labFrameQuadrupole,      atomJ.labFrameDipole,     qIdJ );//MK
    amatrixProductVector3( atomJ.labFrameQuadrupole,      atomI.labFrameDipole,     qJdI );//MK
Mark Friedrichs's avatar
Mark Friedrichs committed
349

350
351
352
353
    amatrixProductVector3( atomI.labFrameQuadrupole,      qJr,    qIqJr );//MK
    amatrixProductVector3( atomJ.labFrameQuadrupole,      qIr,    qJqIr );//MK
    amatrixProductVector3( atomJ.labFrameQuadrupole,      qIr,    temp1 );
    amatrixProductVector3( atomJ.labFrameQuadrupole,      atomI.labFrameDipole,     temp2 );
Mark Friedrichs's avatar
Mark Friedrichs committed
354
355

    for( int ii = 0; ii < 3; ii++ ){
Peter Eastman's avatar
Peter Eastman committed
356
357
358
359
360
        ftm2[ii] = gf1*deltaR[ii]                             +
                   gf2*atomI.labFrameDipole[ii]     + gf3*atomJ.labFrameDipole[ii]  +
                   gf4*(temp2[ii]  - qIdJ[ii])                    +
                   gf5*qIr[ii]    + gf6*qJr[ii] +
                   gf7*(qIqJr[ii] + temp1[ii]);
Mark Friedrichs's avatar
Mark Friedrichs committed
361
362
363
364
365
366
367
    
    }

    // get the induced force;

    // intermediate variables for the induced-permanent terms;
    
Peter Eastman's avatar
Peter Eastman committed
368
369
370
371
372
    float gfi1 = rr5*0.5f*((gli1+gli6)*psc0 + (glip1+glip6)*dsc0 + scip2*scaleI0) + rr7*((gli7+gli2)*psc1 + (glip7+glip2)*dsc1 -
                                                       (sci3*scip4+scip3*sci4)*scaleI1)*0.5f + rr9*(gli3*psc2+glip3*dsc2)*0.5f;
    float gfi4 = 2.0f*rr5;
    float gfi5 = rr7* (sci4*psc2 + scip4*dsc2);
    float gfi6 = -rr7*(sci3*psc2 + scip3*dsc2);
Mark Friedrichs's avatar
Mark Friedrichs committed
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387


    float ftm2i[3];
    float temp4[3];
    float temp5[3];
    float temp6[3];
    float temp7[3];
    float temp8[3];
    float temp9[3];
    float temp10[3];
    float temp11[3];
    float temp12[3];
    float temp13[3];
    float temp14[3];
    float temp15[3];
388
389
    float qIuJp[3], qJuIp[3];
    float qIuJ[3], qJuI[3];
Mark Friedrichs's avatar
Mark Friedrichs committed
390

391
    amatrixProductVector3(atomJ.labFrameQuadrupole,      atomI.inducedDipoleP,    temp4);
Mark Friedrichs's avatar
Mark Friedrichs committed
392

393
394
395
    amatrixProductVector3(atomI.labFrameQuadrupole,      atomJ.inducedDipoleP,    qIuJp);//MK
    amatrixProductVector3(atomJ.labFrameQuadrupole,      atomI.inducedDipoleP,    qJuIp);//MK
    amatrixProductVector3(atomJ.labFrameQuadrupole,      atomI.inducedDipole ,    qJuI);//MK
Mark Friedrichs's avatar
Mark Friedrichs committed
396

397
398
    amatrixProductVector3(atomJ.labFrameQuadrupole,      atomI.inducedDipole,    temp5);
    amatrixProductVector3(atomI.labFrameQuadrupole,      atomJ.inducedDipole ,     qIuJ);//MK
Mark Friedrichs's avatar
Mark Friedrichs committed
399
400
401

    float temp1_0,temp2_0,temp3_0;
    for( int ii = 0; ii < 3; ii++ ){
Peter Eastman's avatar
Peter Eastman committed
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
        temp1_0 = gfi1*deltaR[ii] +
                  0.5f*(-rr3*atomJ.q*(atomI.inducedDipole[ii]*psc0 + atomI.inducedDipoleP[ii]*dsc0) +
                  rr5*sc4*(atomI.inducedDipole[ii]*psc1 + atomI.inducedDipoleP[ii]*dsc1) -
                  rr7*sc6*(atomI.inducedDipole[ii]*psc2 + atomI.inducedDipoleP[ii]*dsc2)) ;

        temp2_0 = (rr3*atomI.q*(atomJ.inducedDipole[ii]*psc0+atomJ.inducedDipoleP[ii]*dsc0) +
                   rr5*sc3*(atomJ.inducedDipole[ii]*psc1 +atomJ.inducedDipoleP[ii]*dsc1) +
                   rr7*sc5*(atomJ.inducedDipole[ii]*psc2 +atomJ.inducedDipoleP[ii]*dsc2))*0.5f +
                   rr5*scaleI1*(sci4*atomI.inducedDipoleP[ii]+scip4*atomI.inducedDipole[ii] +
                   sci3*atomJ.inducedDipoleP[ii]+scip3*atomJ.inducedDipole[ii])*0.5f ;

        temp3_0 = 0.5f*(sci4*psc1+scip4*dsc1)*rr5*atomI.labFrameDipole[ii] +
                  0.5f*(sci3*psc1+scip3*dsc1)*rr5*atomJ.labFrameDipole[ii] +
                  0.5f*gfi4*((temp5[ii]-qIuJ[ii])*psc1 +
                  (temp4[ii]-qIuJp[ii])*dsc1) + gfi5*qIr[ii] + gfi6*qJr[ii];
Mark Friedrichs's avatar
Mark Friedrichs committed
417
418
419
420
421
422
423
424
425
426
427
428
429
        ftm2i[ii] = temp1_0 + temp2_0 + temp3_0;
    }

    // handle of scaling for partially excluded interactions;
    // correction to convert mutual to direct polarization force;
    
    ftm2i[0] -= (fridmp[0] + findmp[0]);
    ftm2i[1] -= (fridmp[1] + findmp[1]);
    ftm2i[2] -= (fridmp[2] + findmp[2]);
    
    // now perform the torque calculation;
    // intermediate terms for torque between multipoles i and j;
    
Peter Eastman's avatar
Peter Eastman committed
430
431
432
433
434
    float gti2 = 0.5f*(sci4*psc1+scip4*dsc1)*rr5;
    float gti3 = 0.5f*(sci3*psc1+scip3*dsc1)*rr5;
    float gti4 = gfi4;
    float gti5 = gfi5;
    float gti6 = gfi6;
Mark Friedrichs's avatar
Mark Friedrichs committed
435
436
437

    // get the permanent (ttm2, ttm3) and induced interaction torques (ttm2i, ttm3i)
    
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
    acrossProductVector3(atomI.labFrameDipole,      atomJ.labFrameDipole,      temp1);
    acrossProductVector3(atomI.labFrameDipole,      atomJ.inducedDipole ,      temp2);
    acrossProductVector3(atomI.labFrameDipole,      atomJ.inducedDipoleP,     temp3);
    acrossProductVector3(atomI.labFrameDipole,      deltaR,       temp4);
    acrossProductVector3(deltaR,       qIuJp,   temp5);
    acrossProductVector3(deltaR,       qIr,     temp6);
    acrossProductVector3(deltaR,       qIuJ,    temp7);
    acrossProductVector3(atomJ.inducedDipole ,     qIr,     temp8);
    acrossProductVector3(atomJ.inducedDipoleP,     qIr,     temp9);
    acrossProductVector3(atomI.labFrameDipole,     qJr,     temp10);
    acrossProductVector3(atomJ.labFrameDipole,     qIr,     temp11);
    acrossProductVector3(deltaR,       qIqJr,   temp12);
    acrossProductVector3(deltaR,       qIdJ,    temp13);

    amatrixCrossProductMatrix3(atomI.labFrameQuadrupole,      atomJ.labFrameQuadrupole,      temp14);
    acrossProductVector3(qJr, qIr,     temp15);
Mark Friedrichs's avatar
Mark Friedrichs committed
454

Peter Eastman's avatar
Peter Eastman committed
455
456
457
458
459
460
    float ttm2_0  = -rr3*temp1[0] + gf2*temp4[0]-gf5*temp6[0] + gf4*(temp10[0] + temp11[0] + temp13[0]-2.0f*temp14[0]) - gf7*(temp12[0] + temp15[0]);
    float ttm2i_0 = -rr3*(temp2[0]*psc0+temp3[0]*dsc0)*0.5f + gti2*temp4[0] + gti4*((temp8[0]+ temp7[0])*psc1 + (temp9[0] + temp5[0])*dsc1)*0.5f - gti5*temp6[0];
    float ttm2_1  = -rr3*temp1[1] + gf2*temp4[1]-gf5*temp6[1] + gf4*(temp10[1] + temp11[1] + temp13[1]-2.0f*temp14[1]) - gf7*(temp12[1] + temp15[1]);
    float ttm2i_1 = -rr3*(temp2[1]*psc0+temp3[1]*dsc0)*0.5f + gti2*temp4[1] + gti4*((temp8[1]+ temp7[1])*psc1 + (temp9[1] + temp5[1])*dsc1)*0.5f - gti5*temp6[1];
    float ttm2_2  = -rr3*temp1[2] + gf2*temp4[2]-gf5*temp6[2] + gf4*(temp10[2] + temp11[2] + temp13[2]-2.0f*temp14[2]) - gf7*(temp12[2] + temp15[2]);
    float ttm2i_2 = -rr3*(temp2[2]*psc0+temp3[2]*dsc0)*0.5f + gti2*temp4[2] + gti4*((temp8[2]+ temp7[2])*psc1 + (temp9[2] + temp5[2])*dsc1)*0.5f - gti5*temp6[2];
Mark Friedrichs's avatar
Mark Friedrichs committed
461

462
463
464
465
466
467
468
469
470
471
472
473
474
475
    acrossProductVector3(atomJ.labFrameDipole,      deltaR,       temp2  );
    acrossProductVector3(deltaR,       qJr,     temp3  );
    acrossProductVector3(atomI.labFrameDipole,      qJr,     temp4  );
    acrossProductVector3(atomJ.labFrameDipole,      qIr,     temp5  );
    acrossProductVector3(deltaR,       qJdI,    temp6  );
    acrossProductVector3(deltaR,       qJqIr,   temp7  );
    acrossProductVector3(qJr,     qIr,     temp8  ); // _qJrxqIr
    acrossProductVector3(atomJ.labFrameDipole,      atomI.inducedDipole ,      temp9  ); // _dJxuI
    acrossProductVector3(atomJ.labFrameDipole,      atomI.inducedDipoleP,     temp10 ); // _dJxuIp

    acrossProductVector3(atomI.inducedDipoleP,     qJr,     temp11 ); // _uIxqJrp
    acrossProductVector3(atomI.inducedDipole ,     qJr,     temp12 ); // _uIxqJr
    acrossProductVector3(deltaR,       qJuIp,   temp13 ); // _rxqJuIp
    acrossProductVector3(deltaR,       qJuI,    temp15 ); // _rxqJuI
Mark Friedrichs's avatar
Mark Friedrichs committed
476

Peter Eastman's avatar
Peter Eastman committed
477
478
479
480
481
482
    float ttm3_0 = rr3*temp1[0] + gf3*temp2[0] - gf6*temp3[0] - gf4*(temp4[0] + temp5[0] + temp6[0] - 2.0f*temp14[0]) - gf7*(temp7[0] - temp8[0]);
    float ttm3i_0 = -rr3*(temp9[0]*psc0+ temp10[0]*dsc0)*0.5f + gti3*temp2[0] - gti4*((temp12[0] + temp15[0])*psc1 + (temp11[0] + temp13[0])*dsc1)*0.5f - gti6*temp3[0];
    float ttm3_1 = rr3*temp1[1] + gf3*temp2[1] - gf6*temp3[1] - gf4*(temp4[1] + temp5[1] + temp6[1] - 2.0f*temp14[1]) - gf7*(temp7[1] - temp8[1]);
    float ttm3i_1 = -rr3*(temp9[1]*psc0+ temp10[1]*dsc0)*0.5f + gti3*temp2[1] - gti4*((temp12[1] + temp15[1])*psc1 + (temp11[1] + temp13[1])*dsc1)*0.5f - gti6*temp3[1];
    float ttm3_2 = rr3*temp1[2] + gf3*temp2[2] - gf6*temp3[2] - gf4*(temp4[2] + temp5[2] + temp6[2] - 2.0f*temp14[2]) - gf7*(temp7[2] - temp8[2]);
    float ttm3i_2 = -rr3*(temp9[2]*psc0+ temp10[2]*dsc0)*0.5f + gti3*temp2[2] - gti4*((temp12[2] + temp15[2])*psc1 + (temp11[2] + temp13[2])*dsc1)*0.5f - gti6*temp3[2];
Mark Friedrichs's avatar
Mark Friedrichs committed
483
484
485
486
487
488
489

    if( scalingFactors[MScaleIndex] < 1.0f ){
    
        ftm2[0] *= scalingFactors[MScaleIndex];
        ftm2[1] *= scalingFactors[MScaleIndex];
        ftm2[2] *= scalingFactors[MScaleIndex];
        
Peter Eastman's avatar
Peter Eastman committed
490
491
492
        ttm2_0 *= scalingFactors[MScaleIndex];
        ttm2_1 *= scalingFactors[MScaleIndex];
        ttm2_2 *= scalingFactors[MScaleIndex];
Mark Friedrichs's avatar
Mark Friedrichs committed
493
        
Peter Eastman's avatar
Peter Eastman committed
494
495
496
        ttm3_0 *= scalingFactors[MScaleIndex];
        ttm3_1 *= scalingFactors[MScaleIndex];
        ttm3_2 *= scalingFactors[MScaleIndex];
Mark Friedrichs's avatar
Mark Friedrichs committed
497
498
499
500
501
502
503
504
505
506
    
    }


#ifdef AMOEBA_DEBUG
#if 0
if( 0 ){
int debugIndex               = 0;
    debugArray[debugIndex].x = conversionFactor*ftm2[0];
    debugArray[debugIndex].y = conversionFactor*ftm2i[0];
Peter Eastman's avatar
Peter Eastman committed
507
508
    debugArray[debugIndex].z = conversionFactor*ttm3_0;
    debugArray[debugIndex].w = conversionFactor*ttm3i_0;
Mark Friedrichs's avatar
Mark Friedrichs committed
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566

    debugIndex++;
    debugArray[debugIndex].x = temp1[0];
    debugArray[debugIndex].y = temp1[1];
    debugArray[debugIndex].z = temp1[2];
    debugArray[debugIndex].w = 1.0f;

    debugIndex++;
    debugArray[debugIndex].x = temp2[0];
    debugArray[debugIndex].y = temp2[1];
    debugArray[debugIndex].z = temp2[2];
    debugArray[debugIndex].w = 2.0f;

    debugIndex++;
    debugArray[debugIndex].x = temp3[0];
    debugArray[debugIndex].y = temp3[1];
    debugArray[debugIndex].z = temp3[2];
    debugArray[debugIndex].w = 3.0f;

    debugIndex++;
    debugArray[debugIndex].x = temp4[0];
    debugArray[debugIndex].y = temp4[1];
    debugArray[debugIndex].z = temp4[2];
    debugArray[debugIndex].w = 4.0f;

    debugIndex++;
    debugArray[debugIndex].x = temp5[0];
    debugArray[debugIndex].y = temp5[1];
    debugArray[debugIndex].z = temp5[2];
    debugArray[debugIndex].w = 5.0f;

    debugIndex++;
    debugArray[debugIndex].x = temp6[0];
    debugArray[debugIndex].y = temp6[1];
    debugArray[debugIndex].z = temp6[2];
    debugArray[debugIndex].w = 6.0f;

    debugIndex++;
    debugArray[debugIndex].x = temp14[0];
    debugArray[debugIndex].y = temp14[1];
    debugArray[debugIndex].z = temp14[2];
    debugArray[debugIndex].w = 14.0f;

    debugIndex++;
    debugArray[debugIndex].x = temp7[0];
    debugArray[debugIndex].y = temp7[1];
    debugArray[debugIndex].z = temp7[2];
    debugArray[debugIndex].w = 7.0f;


    debugIndex++;
    debugArray[debugIndex].x = temp8[0];
    debugArray[debugIndex].y = temp8[1];
    debugArray[debugIndex].z = temp8[2];
    debugArray[debugIndex].w = 8.0f;

    debugIndex++;
    debugArray[debugIndex].x = rr3;
Peter Eastman's avatar
Peter Eastman committed
567
568
    debugArray[debugIndex].y = gf3;
    debugArray[debugIndex].z = gf6;
Mark Friedrichs's avatar
Mark Friedrichs committed
569
570
571
    debugArray[debugIndex].w = 20.0f;

    debugIndex++;
Peter Eastman's avatar
Peter Eastman committed
572
573
    debugArray[debugIndex].x = gf4;
    debugArray[debugIndex].y = gf7;
Mark Friedrichs's avatar
Mark Friedrichs committed
574
575
576
577
    debugArray[debugIndex].z = 0.0f;
    debugArray[debugIndex].w = 21.0f;

    debugIndex++;
578
579
580
    debugArray[debugIndex].x = atomJ.labFrameDipole[0];
    debugArray[debugIndex].y = atomJ.labFrameDipole[1];
    debugArray[debugIndex].z = atomJ.labFrameDipole[2];
Mark Friedrichs's avatar
Mark Friedrichs committed
581
582
583
    debugArray[debugIndex].w = 22.0f;

    debugIndex++;
584
585
586
    debugArray[debugIndex].x = deltaR[0];
    debugArray[debugIndex].y = deltaR[1];
    debugArray[debugIndex].z = deltaR[2];
Mark Friedrichs's avatar
Mark Friedrichs committed
587
588
589
590
591
592
593
594
595
596
    debugArray[debugIndex].w = 23.0f;

}
#endif
#endif

    outputForce[0]        = -conversionFactor*(ftm2[0] + ftm2i[0]);
    outputForce[1]        = -conversionFactor*(ftm2[1] + ftm2i[1]);
    outputForce[2]        = -conversionFactor*(ftm2[2] + ftm2i[2]);
    
Peter Eastman's avatar
Peter Eastman committed
597
598
599
    outputTorque[0][0]    = conversionFactor*(ttm2_0 + ttm2i_0);
    outputTorque[0][1]    = conversionFactor*(ttm2_1 + ttm2i_1);
    outputTorque[0][2]    = conversionFactor*(ttm2_2 + ttm2i_2);
Mark Friedrichs's avatar
Mark Friedrichs committed
600

Peter Eastman's avatar
Peter Eastman committed
601
602
603
    outputTorque[1][0]    = conversionFactor*(ttm3_0 + ttm3i_0);
    outputTorque[1][1]    = conversionFactor*(ttm3_1 + ttm3i_1);
    outputTorque[1][2]    = conversionFactor*(ttm3_2 + ttm3i_2);
Mark Friedrichs's avatar
Mark Friedrichs committed
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621

    return;

}

__device__ void loadElectrostaticShared( struct ElectrostaticParticle* sA, unsigned int atomI,
                                         float4* atomCoord, float* labFrameDipoleJ, float* labQuadrupole,
                                         float* inducedDipole, float* inducedDipolePolar, float2* dampingFactorAndThole )
{
    // coordinates & charge

    sA->x                        = atomCoord[atomI].x;
    sA->y                        = atomCoord[atomI].y;
    sA->z                        = atomCoord[atomI].z;
    sA->q                        = atomCoord[atomI].w;

    // lab dipole

622
623
624
    sA->labFrameDipole[0]         = labFrameDipoleJ[atomI*3];
    sA->labFrameDipole[1]         = labFrameDipoleJ[atomI*3+1];
    sA->labFrameDipole[2]         = labFrameDipoleJ[atomI*3+2];
Mark Friedrichs's avatar
Mark Friedrichs committed
625
626
627

    // lab quadrupole

628
629
630
631
632
633
634
635
636
    sA->labFrameQuadrupole[0]    = labQuadrupole[atomI*9];
    sA->labFrameQuadrupole[1]    = labQuadrupole[atomI*9+1];
    sA->labFrameQuadrupole[2]    = labQuadrupole[atomI*9+2];
    sA->labFrameQuadrupole[3]    = labQuadrupole[atomI*9+3];
    sA->labFrameQuadrupole[4]    = labQuadrupole[atomI*9+4];
    sA->labFrameQuadrupole[5]    = labQuadrupole[atomI*9+5];
    sA->labFrameQuadrupole[6]    = labQuadrupole[atomI*9+6];
    sA->labFrameQuadrupole[7]    = labQuadrupole[atomI*9+7];
    sA->labFrameQuadrupole[8]    = labQuadrupole[atomI*9+8];
Mark Friedrichs's avatar
Mark Friedrichs committed
637
638
639

    // induced dipole

640
641
642
    sA->inducedDipole[0]          = inducedDipole[atomI*3];
    sA->inducedDipole[1]          = inducedDipole[atomI*3+1];
    sA->inducedDipole[2]          = inducedDipole[atomI*3+2];
Mark Friedrichs's avatar
Mark Friedrichs committed
643
644
645

    // induced dipole polar

646
647
648
    sA->inducedDipoleP[0]         = inducedDipolePolar[atomI*3];
    sA->inducedDipoleP[1]         = inducedDipolePolar[atomI*3+1];
    sA->inducedDipoleP[2]         = inducedDipolePolar[atomI*3+2];
Mark Friedrichs's avatar
Mark Friedrichs committed
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672

    sA->damp                     = dampingFactorAndThole[atomI].x;
    sA->thole                    = dampingFactorAndThole[atomI].y;

}

// Include versions of the kernels for N^2 calculations.

#undef USE_OUTPUT_BUFFER_PER_WARP
#define METHOD_NAME(a, b) a##N2##b
#include "kCalculateAmoebaCudaElectrostatic.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##N2ByWarp##b
#include "kCalculateAmoebaCudaElectrostatic.h"

// reduce psWorkArray_3_1 -> force
// reduce psWorkArray_3_2 -> torque

static void kReduceForceTorque(amoebaGpuContext amoebaGpu )
{
    kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
                               amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
                               amoebaGpu->psWorkArray_3_1->_pDevStream[0], amoebaGpu->psForce->_pDevStream[0] );
673
    LAUNCHERROR("kReduceElectrostaticForce");
Mark Friedrichs's avatar
Mark Friedrichs committed
674
675
676
    kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
                               amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
                               amoebaGpu->psWorkArray_3_2->_pDevStream[0], amoebaGpu->psTorque->_pDevStream[0] );
677
    LAUNCHERROR("kReduceElectrostaticTorque");
Mark Friedrichs's avatar
Mark Friedrichs committed
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
}

#ifdef AMOEBA_DEBUG
static void printElectrostaticBuffer( amoebaGpuContext amoebaGpu, unsigned int bufferIndex )
{
    (void) fprintf( amoebaGpu->log, "Electrostatic Buffer %u\n", bufferIndex );
    unsigned int start = bufferIndex*3*amoebaGpu->paddedNumberOfAtoms;
    unsigned int stop  = (bufferIndex+1)*3*amoebaGpu->paddedNumberOfAtoms;
    for( unsigned int ii = start; ii < stop; ii += 3 ){
        unsigned int ii3Index      = ii/3;
        unsigned int bufferIndex   = ii3Index/(amoebaGpu->paddedNumberOfAtoms);
        unsigned int particleIndex = ii3Index - bufferIndex*(amoebaGpu->paddedNumberOfAtoms);
        (void) fprintf( amoebaGpu->log, "   %6u %3u %6u [%14.6e %14.6e %14.6e] [%14.6e %14.6e %14.6e]\n", 
                            ii/3,  bufferIndex, particleIndex,
                            amoebaGpu->psWorkArray_3_1->_pSysStream[0][ii],
                            amoebaGpu->psWorkArray_3_1->_pSysStream[0][ii+1],
                            amoebaGpu->psWorkArray_3_1->_pSysStream[0][ii+2],
                            amoebaGpu->psWorkArray_3_2->_pSysStream[0][ii],
                            amoebaGpu->psWorkArray_3_2->_pSysStream[0][ii+1],
                            amoebaGpu->psWorkArray_3_2->_pSysStream[0][ii+2] );
    } 

/*
    start = 0;
    stop  = -146016;
    float maxV = -1.0e+99;
    for( unsigned int ii = start; ii < stop; ii += 3 ){
        if(  amoebaGpu->psWorkArray_3_1->_pSysStream[0][ii] > maxV ){ 
            unsigned int ii3Index      = ii/3;
            unsigned int bufferIndex   = ii3Index/(amoebaGpu->paddedNumberOfAtoms);
            unsigned int particleIndex = ii3Index - bufferIndex*(amoebaGpu->paddedNumberOfAtoms);
            (void) fprintf( amoebaGpu->log, "MaxQ %6u %3u %6u %14.6e\n", 
                            ii/3,  bufferIndex, particleIndex,
                            amoebaGpu->psWorkArray_3_1->_pSysStream[0][ii] );
            maxV = amoebaGpu->psWorkArray_3_1->_pSysStream[0][ii];
        } 
    } 
*/
}

static void printElectrostaticAtomBuffers( amoebaGpuContext amoebaGpu, unsigned int targetAtom )
{
    (void) fprintf( amoebaGpu->log, "Electrostatic atom %u\n", targetAtom );
    for( unsigned int ii = 0; ii < amoebaGpu->outputBuffers; ii++ ){
        unsigned int particleIndex = 3*(targetAtom + ii*amoebaGpu->paddedNumberOfAtoms);
        (void) fprintf( amoebaGpu->log, " %2u %6u [%14.6e %14.6e %14.6e] [%14.6e %14.6e %14.6e]\n", 
                        ii, particleIndex,
                        amoebaGpu->psWorkArray_3_1->_pSysStream[0][particleIndex],
                        amoebaGpu->psWorkArray_3_1->_pSysStream[0][particleIndex+1],
                        amoebaGpu->psWorkArray_3_1->_pSysStream[0][particleIndex+2],
                        amoebaGpu->psWorkArray_3_2->_pSysStream[0][particleIndex],
                        amoebaGpu->psWorkArray_3_2->_pSysStream[0][particleIndex+1],
                        amoebaGpu->psWorkArray_3_2->_pSysStream[0][particleIndex+2] );
    } 
}
#endif

/**---------------------------------------------------------------------------------------

   Compute Amoeba electrostatic force & torque

   @param amoebaGpu        amoebaGpu context
   @param gpu              OpenMM gpu Cuda context

   --------------------------------------------------------------------------------------- */

void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu )
{
  
   // ---------------------------------------------------------------------------------------

Peter Eastman's avatar
Peter Eastman committed
749
750
    static unsigned int threadsPerBlock = 0;

Mark Friedrichs's avatar
Mark Friedrichs committed
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
#ifdef AMOEBA_DEBUG
    static const char* methodName = "cudaComputeAmoebaElectrostatic";
    static int timestep = 0;
    std::vector<int> fileId;
    timestep++;
    fileId.resize( 2 );
    fileId[0] = timestep;
    fileId[1] = 1;
#endif

    // ---------------------------------------------------------------------------------------

    gpuContext gpu = amoebaGpu->gpuContext;

    // apparently debug array can take up nontrivial no. registers

#ifdef AMOEBA_DEBUG
    if( amoebaGpu->log ){
Mark Friedrichs's avatar
Mark Friedrichs committed
769
770
        (void) fprintf( amoebaGpu->log, "%s %d maxCovalentDegreeSz=%d ZZZ\n",
                        methodName, gpu->natoms, amoebaGpu->maxCovalentDegreeSz );
Mark Friedrichs's avatar
Mark Friedrichs committed
771
772
773
774
775
776
777
778
    }   
   int paddedNumberOfAtoms                    = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
    CUDAStream<float4>* debugArray            = new CUDAStream<float4>(paddedNumberOfAtoms*paddedNumberOfAtoms, 1, "DebugArray");
    memset( debugArray->_pSysStream[0],      0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
    debugArray->Upload();
    unsigned int targetAtom                   = 0;
#endif

Peter Eastman's avatar
Peter Eastman committed
779
780
781
782
783
    // on first pass, set threads/block

    if( threadsPerBlock == 0 ){
        unsigned int maxThreads;
        if (gpu->sm_version >= SM_20)
Peter Eastman's avatar
Peter Eastman committed
784
            maxThreads = 384;
Peter Eastman's avatar
Peter Eastman committed
785
786
787
788
        else if (gpu->sm_version >= SM_12)
            maxThreads = 128;
        else
            maxThreads = 64;
Peter Eastman's avatar
Peter Eastman committed
789
        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(ElectrostaticParticle)), maxThreads);
Peter Eastman's avatar
Peter Eastman committed
790
791
    }

Mark Friedrichs's avatar
Mark Friedrichs committed
792
793
794
795
    kClearFields_3( amoebaGpu, 2 );

    if (gpu->bOutputBufferPerWarp){

796
        (void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaElectrostaticN2Forces warp:  numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%lu shrd=%lu Ebuf=%u ixnCt=%lu workUnits=%u\n",
Peter Eastman's avatar
Peter Eastman committed
797
798
                        amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
                        sizeof(ElectrostaticParticle), sizeof(ElectrostaticParticle)*threadsPerBlock, amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
Mark Friedrichs's avatar
Mark Friedrichs committed
799
        (void) fflush( amoebaGpu->log );
Peter Eastman's avatar
Peter Eastman committed
800
        kCalculateAmoebaCudaElectrostaticN2ByWarpForces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(ElectrostaticParticle)*threadsPerBlock>>>(
Mark Friedrichs's avatar
Mark Friedrichs committed
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
                                                                           amoebaGpu->psWorkUnit->_pDevStream[0],
                                                                           gpu->psPosq4->_pDevStream[0],
                                                                           amoebaGpu->psLabFrameDipole->_pDevStream[0],
                                                                           amoebaGpu->psLabFrameQuadrupole->_pDevStream[0],
                                                                           amoebaGpu->psInducedDipole->_pDevStream[0],
                                                                           amoebaGpu->psInducedDipolePolar->_pDevStream[0],
                                                                           amoebaGpu->psWorkArray_3_1->_pDevStream[0],
#ifdef AMOEBA_DEBUG
                                                                           amoebaGpu->psWorkArray_3_2->_pDevStream[0],
                                                                           debugArray->_pDevStream[0], targetAtom );
#else
                                                                           amoebaGpu->psWorkArray_3_2->_pDevStream[0] );
#endif

    } else {

#ifdef AMOEBA_DEBUG
        (void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaElectrostaticN2Forces no warp:  numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u\n",
Peter Eastman's avatar
Peter Eastman committed
819
820
                        amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
                        sizeof(ElectrostaticParticle), sizeof(ElectrostaticParticle)*threadsPerBlock, amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
Mark Friedrichs's avatar
Mark Friedrichs committed
821
822
823
        (void) fflush( amoebaGpu->log );
#endif

Peter Eastman's avatar
Peter Eastman committed
824
        kCalculateAmoebaCudaElectrostaticN2Forces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(ElectrostaticParticle)*threadsPerBlock>>>(
Mark Friedrichs's avatar
Mark Friedrichs committed
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
                                                                           amoebaGpu->psWorkUnit->_pDevStream[0],
                                                                           gpu->psPosq4->_pDevStream[0],
                                                                           amoebaGpu->psLabFrameDipole->_pDevStream[0],
                                                                           amoebaGpu->psLabFrameQuadrupole->_pDevStream[0],
                                                                           amoebaGpu->psInducedDipole->_pDevStream[0],
                                                                           amoebaGpu->psInducedDipolePolar->_pDevStream[0],
                                                                           amoebaGpu->psWorkArray_3_1->_pDevStream[0],
#ifdef AMOEBA_DEBUG
                                                                           amoebaGpu->psWorkArray_3_2->_pDevStream[0],
                                                                           debugArray->_pDevStream[0], targetAtom );
#else
                                                                           amoebaGpu->psWorkArray_3_2->_pDevStream[0] );
#endif
    }
    LAUNCHERROR("kCalculateAmoebaCudaElectrostaticN2Forces");

#ifdef AMOEBA_DEBUG
    if( 0 && amoebaGpu->log ){

        amoebaGpu->psWorkArray_3_1->Download();
        amoebaGpu->psWorkArray_3_2->Download();

        printElectrostaticAtomBuffers( amoebaGpu, (targetAtom + 0) );
        //printElectrostaticAtomBuffers( amoebaGpu, (targetAtom + 1231) );
        printElectrostaticBuffer( amoebaGpu, 0 );
        //printElectrostaticBuffer( amoebaGpu, 38 );
    }
#endif

    kReduceForceTorque( amoebaGpu );

#ifdef AMOEBA_DEBUG
    if( amoebaGpu->log ){

        amoebaGpu->psForce->Download();
        amoebaGpu->psTorque->Download();
        debugArray->Download();

        (void) fprintf( amoebaGpu->log, "Finished Electrostatic kernel execution\n" ); (void) fflush( amoebaGpu->log );

        int maxPrint        = 1400;
        for( int ii = 0; ii < gpu->natoms; ii++ ){
           (void) fprintf( amoebaGpu->log, "%5d ", ii); 

            int indexOffset     = ii*3;
    
           // force

           (void) fprintf( amoebaGpu->log,"ElectrostaticF [%16.9e %16.9e %16.9e] ",
                           amoebaGpu->psForce->_pSysStream[0][indexOffset],
                           amoebaGpu->psForce->_pSysStream[0][indexOffset+1],
                           amoebaGpu->psForce->_pSysStream[0][indexOffset+2] );
    
           // torque

           (void) fprintf( amoebaGpu->log,"ElectrostaticT [%16.9e %16.9e %16.9e] ",
                           amoebaGpu->psTorque->_pSysStream[0][indexOffset],
                           amoebaGpu->psTorque->_pSysStream[0][indexOffset+1],
                           amoebaGpu->psTorque->_pSysStream[0][indexOffset+2] );

           // coords

#if 0
            (void) fprintf( amoebaGpu->log,"x[%16.9e %16.9e %16.9e] ",
                            gpu->psPosq4->_pSysStream[0][ii].x,
                            gpu->psPosq4->_pSysStream[0][ii].y,
                            gpu->psPosq4->_pSysStream[0][ii].z);


           for( int jj = 0; jj < gpu->natoms && jj < 5; jj++ ){
               int debugIndex = jj*gpu->natoms + ii;
               float xx       =  gpu->psPosq4->_pSysStream[0][jj].x -  gpu->psPosq4->_pSysStream[0][ii].x;
               float yy       =  gpu->psPosq4->_pSysStream[0][jj].y -  gpu->psPosq4->_pSysStream[0][ii].y;
               float zz       =  gpu->psPosq4->_pSysStream[0][jj].z -  gpu->psPosq4->_pSysStream[0][ii].z;
               (void) fprintf( amoebaGpu->log,"\n%4d %4d delta [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e] ",
                               ii, jj, xx, yy, zz,
                               debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y, debugArray->_pSysStream[0][debugIndex].z );

           }
#endif
           (void) fprintf( amoebaGpu->log,"\n" );
           if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
                ii = gpu->natoms - maxPrint;
           }
        }
        if( 1 ){
            (void) fprintf( amoebaGpu->log,"DebugElec\n" );
            int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
            for( int jj = 0; jj < gpu->natoms; jj++ ){
                int debugIndex = jj;
                for( int kk = 0; kk < 5; kk++ ){
                    (void) fprintf( amoebaGpu->log,"%5d %5d [%16.9e %16.9e %16.9e %16.9e] E11\n", targetAtom, jj,
                                    debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
                                    debugArray->_pSysStream[0][debugIndex].z, debugArray->_pSysStream[0][debugIndex].w );
                    debugIndex += paddedNumberOfAtoms;
                }
                (void) fprintf( amoebaGpu->log,"\n" );
            }
        }
        (void) fflush( amoebaGpu->log );

        if( 0 ){
            (void) fprintf( amoebaGpu->log, "%s Tiled F & T\n", methodName ); fflush( amoebaGpu->log );
            int maxPrint = 12;
            for( int ii = 0; ii < gpu->natoms; ii++ ){
    
                // print cpu & gpu reductions
    
                int offset  = 3*ii;
    
                (void) fprintf( amoebaGpu->log,"%6d F[%16.7e %16.7e %16.7e] T[%16.7e %16.7e %16.7e]\n", ii,
                                amoebaGpu->psForce->_pSysStream[0][offset],
                                amoebaGpu->psForce->_pSysStream[0][offset+1],
                                amoebaGpu->psForce->_pSysStream[0][offset+2],
                                amoebaGpu->psTorque->_pSysStream[0][offset],
                                amoebaGpu->psTorque->_pSysStream[0][offset+1],
                                amoebaGpu->psTorque->_pSysStream[0][offset+2] );
                if( (ii == maxPrint) && (ii < (gpu->natoms - maxPrint)) )ii = gpu->natoms - maxPrint; 
            }   
        }   

        if( 1 ){
            std::vector<int> fileId;
            //fileId.push_back( 0 );
            VectorOfDoubleVectors outputVector;
Mark Friedrichs's avatar
Mark Friedrichs committed
950
951
952
            cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,            outputVector, NULL );
            cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psForce,      outputVector, NULL );
            cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psTorque,     outputVector, NULL);
Mark Friedrichs's avatar
Mark Friedrichs committed
953
954
955
956
957
958
959
960
961
962
            cudaWriteVectorOfDoubleVectorsToFile( "CudaForceTorque", fileId, outputVector );
         }

    }   
    delete debugArray;

#endif

   // ---------------------------------------------------------------------------------------
}