kCalculateAmoebaCudaMutualInducedAndGkFields.cu 45 KB
Newer Older
Mark Friedrichs's avatar
Mark Friedrichs committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
//-----------------------------------------------------------------------------------------

//-----------------------------------------------------------------------------------------

#include "amoebaGpuTypes.h"
#include "amoebaCudaKernels.h"
#include "kCalculateAmoebaCudaUtilities.h"

#include <stdio.h>

using namespace std;

static __constant__ cudaGmxSimulation cSim;
static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;

void SetCalculateAmoebaCudaMutualInducedAndGkFieldsSim(amoebaGpuContext amoebaGpu)
{
    cudaError_t status;
    gpuContext gpu = amoebaGpu->gpuContext;
    status         = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));    
    RTERROR(status, "SetCalculateAmoebaCudaMutualInducedAndGkFieldSim: cudaMemcpyToSymbol: SetSim copy to cSim failed");
    status         = cudaMemcpyToSymbol(cAmoebaSim, &amoebaGpu->amoebaSim, sizeof(cudaAmoebaGmxSimulation));    
    RTERROR(status, "SetCalculateAmoebaCudaMutualInducedAndGkFieldSim: cudaMemcpyToSymbol: SetSim copy to cAmoebaSim failed");
}

void GetCalculateAmoebaCudaMutualInducedAndGkFieldsSim(amoebaGpuContext amoebaGpu)
{
    cudaError_t status;
    gpuContext gpu = amoebaGpu->gpuContext;
    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));    
    RTERROR(status, "GetCalculateAmoebaCudaMutualInducedAndGkFieldSim: cudaMemcpyFromSymbol: SetSim copy from cSim failed");
    status = cudaMemcpyFromSymbol(&amoebaGpu->amoebaSim, cAmoebaSim, sizeof(cudaAmoebaGmxSimulation));    
    RTERROR(status, "GetCalculateAmoebaCudaMutualInducedAndGkFieldSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed");
}

//#define AMOEBA_DEBUG
#undef AMOEBA_DEBUG

39
40
41
42
43
#define GK
#include "kCalculateAmoebaCudaMutualInducedParticle.h"
#undef GK

__device__ void calculateMutualInducedAndGkFieldsPairIxn_kernel( MutualInducedParticle& atomI, MutualInducedParticle& atomJ,
Mark Friedrichs's avatar
Mark Friedrichs committed
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
                                                                 float fields[8][3]

#ifdef AMOEBA_DEBUG
               , float4* debugArray
#endif

 )
{

    float deltaR[3];
    
    // ---------------------------------------------------------------------------------------
    
    // get deltaR, and r between 2 atoms
    
59
60
61
    deltaR[0]                                    = atomJ.x - atomI.x;
    deltaR[1]                                    = atomJ.y - atomI.y;
    deltaR[2]                                    = atomJ.z - atomI.z;
Mark Friedrichs's avatar
Mark Friedrichs committed
62
63
64
65
66
67
68

    float r                                      =  sqrtf( deltaR[0]*deltaR[0] + deltaR[1]*deltaR[1] + deltaR[2]*deltaR[2] );
    float rI                                     =  1.0f/r;
    float r2I                                    =  rI*rI;
    float rr3                                    = -rI*r2I;
    float rr5                                    = -3.0f*rr3*r2I;
    
69
    float dampProd                               = atomI.damp*atomJ.damp;
Mark Friedrichs's avatar
Mark Friedrichs committed
70
    float ratio                                  = (dampProd != 0.0f) ? (r/dampProd) : 1.0f;
71
    float pGamma                                 = atomI.thole > atomJ.thole ? atomJ.thole: atomI.thole;
Mark Friedrichs's avatar
Mark Friedrichs committed
72
    float damp                                   = ratio*ratio*ratio*pGamma;
73
    float dampExp                                = ( (dampProd != 0.0f) && (r < cAmoebaSim.scalingDistanceCutoff) ) ? expf( -damp ) : 0.0f; 
Mark Friedrichs's avatar
Mark Friedrichs committed
74
75
76
77

    rr3                                         *= (1.0f - dampExp);
    rr5                                         *= (1.0f - ( 1.0f + damp )*dampExp);
        
78
79
80
81
    float dDotDelta                              = rr5*(deltaR[0]*atomJ.inducedDipole[0]    + deltaR[1]*atomJ.inducedDipole[1]    + deltaR[2]*atomJ.inducedDipole[2] );
    fields[0][0]                                 = rr3*atomJ.inducedDipole[0] + dDotDelta*deltaR[0];
    fields[0][1]                                 = rr3*atomJ.inducedDipole[1] + dDotDelta*deltaR[1];
    fields[0][2]                                 = rr3*atomJ.inducedDipole[2] + dDotDelta*deltaR[2];
Mark Friedrichs's avatar
Mark Friedrichs committed
82
   
83
84
85
86
    dDotDelta                                    = rr5*(deltaR[0]*atomJ.inducedDipolePolar[0]    + deltaR[1]*atomJ.inducedDipolePolar[1]    + deltaR[2]*atomJ.inducedDipolePolar[2] );
    fields[1][0]                                 = rr3*atomJ.inducedDipolePolar[0] + dDotDelta*deltaR[0];
    fields[1][1]                                 = rr3*atomJ.inducedDipolePolar[1] + dDotDelta*deltaR[1];
    fields[1][2]                                 = rr3*atomJ.inducedDipolePolar[2] + dDotDelta*deltaR[2];
Mark Friedrichs's avatar
Mark Friedrichs committed
87
  
88
89
90
91
    dDotDelta                                    = rr5*(deltaR[0]*atomI.inducedDipole[0]    + deltaR[1]*atomI.inducedDipole[1]    + deltaR[2]*atomI.inducedDipole[2] );
    fields[2][0]                                 = rr3*atomI.inducedDipole[0] + dDotDelta*deltaR[0];
    fields[2][1]                                 = rr3*atomI.inducedDipole[1] + dDotDelta*deltaR[1];
    fields[2][2]                                 = rr3*atomI.inducedDipole[2] + dDotDelta*deltaR[2];
Mark Friedrichs's avatar
Mark Friedrichs committed
92
   
93
94
95
96
97
98
99
100
101
    dDotDelta                                    = rr5*(deltaR[0]*atomI.inducedDipolePolar[0]    + deltaR[1]*atomI.inducedDipolePolar[1]    + deltaR[2]*atomI.inducedDipolePolar[2] );
    fields[3][0]                                 = rr3*atomI.inducedDipolePolar[0] + dDotDelta*deltaR[0];
    fields[3][1]                                 = rr3*atomI.inducedDipolePolar[1] + dDotDelta*deltaR[1];
    fields[3][2]                                 = rr3*atomI.inducedDipolePolar[2] + dDotDelta*deltaR[2];

    dDotDelta                                    = rr5*(deltaR[0]*atomJ.inducedDipoleS[0]    + deltaR[1]*atomJ.inducedDipoleS[1]    + deltaR[2]*atomJ.inducedDipoleS[2] );
    fields[4][0]                                 = rr3*atomJ.inducedDipoleS[0] + dDotDelta*deltaR[0];
    fields[4][1]                                 = rr3*atomJ.inducedDipoleS[1] + dDotDelta*deltaR[1];
    fields[4][2]                                 = rr3*atomJ.inducedDipoleS[2] + dDotDelta*deltaR[2];
Mark Friedrichs's avatar
Mark Friedrichs committed
102
   
103
104
105
106
    dDotDelta                                    = rr5*(deltaR[0]*atomJ.inducedDipolePolarS[0]    + deltaR[1]*atomJ.inducedDipolePolarS[1]    + deltaR[2]*atomJ.inducedDipolePolarS[2] );
    fields[5][0]                                 = rr3*atomJ.inducedDipolePolarS[0] + dDotDelta*deltaR[0];
    fields[5][1]                                 = rr3*atomJ.inducedDipolePolarS[1] + dDotDelta*deltaR[1];
    fields[5][2]                                 = rr3*atomJ.inducedDipolePolarS[2] + dDotDelta*deltaR[2];
Mark Friedrichs's avatar
Mark Friedrichs committed
107
  
108
109
110
111
    dDotDelta                                    = rr5*(deltaR[0]*atomI.inducedDipoleS[0]    + deltaR[1]*atomI.inducedDipoleS[1]    + deltaR[2]*atomI.inducedDipoleS[2] );
    fields[6][0]                                 = rr3*atomI.inducedDipoleS[0] + dDotDelta*deltaR[0];
    fields[6][1]                                 = rr3*atomI.inducedDipoleS[1] + dDotDelta*deltaR[1];
    fields[6][2]                                 = rr3*atomI.inducedDipoleS[2] + dDotDelta*deltaR[2];
Mark Friedrichs's avatar
Mark Friedrichs committed
112
   
113
114
115
116
    dDotDelta                                    = rr5*(deltaR[0]*atomI.inducedDipolePolarS[0]    + deltaR[1]*atomI.inducedDipolePolarS[1]    + deltaR[2]*atomI.inducedDipolePolarS[2] );
    fields[7][0]                                 = rr3*atomI.inducedDipolePolarS[0] + dDotDelta*deltaR[0];
    fields[7][1]                                 = rr3*atomI.inducedDipolePolarS[1] + dDotDelta*deltaR[1];
    fields[7][2]                                 = rr3*atomI.inducedDipolePolarS[2] + dDotDelta*deltaR[2];
Mark Friedrichs's avatar
Mark Friedrichs committed
117
118
119
120


}

121
__device__ void calculateMutualInducedAndGkFieldsGkPairIxn_kernel( MutualInducedParticle& atomI, MutualInducedParticle& atomJ,
Mark Friedrichs's avatar
Mark Friedrichs committed
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
                                                                   float gkField[8][3]

#ifdef AMOEBA_DEBUG
               , float4* debugArray
#endif

 )
{

    float gux[5];
    float guy[5];
    float guz[5];
    float a[3][3];
    
    // ---------------------------------------------------------------------------------------
    
138
139
140
    float xr               = atomJ.x - atomI.x;
    float yr               = atomJ.y - atomI.y;
    float zr               = atomJ.z - atomI.z;
Mark Friedrichs's avatar
Mark Friedrichs committed
141
142
143
144
145

    float xr2              = xr*xr;
    float yr2              = yr*yr;
    float zr2              = zr*zr;

146
147
    float rb2              = atomI.bornRadius*atomJ.bornRadius;

Mark Friedrichs's avatar
Mark Friedrichs committed
148
149
150
151
152
153
154
155
156
157
    float r2               = xr2 + yr2 + zr2;
    float expterm          = expf(-r2/(cAmoebaSim.gkc*rb2));
    float expc             = expterm /cAmoebaSim.gkc; 
    //float dexpc            = -2.0f / (cAmoebaSim.gkc*rb2);

    float gf2              = 1.0f / (r2+rb2*expterm);
    float gf               = sqrtf(gf2);
    float gf3              = gf2 * gf;
    float gf5              = gf3 * gf2;

158
159
160
    float duixs            = atomI.inducedDipoleS[0];
    float duiys            = atomI.inducedDipoleS[1];
    float duizs            = atomI.inducedDipoleS[2];
Mark Friedrichs's avatar
Mark Friedrichs committed
161

162
163
164
    float puixs            = atomI.inducedDipolePolarS[0];
    float puiys            = atomI.inducedDipolePolarS[1];
    float puizs            = atomI.inducedDipolePolarS[2];
Mark Friedrichs's avatar
Mark Friedrichs committed
165
 
166
167
168
    float dukxs            = atomJ.inducedDipoleS[0];
    float dukys            = atomJ.inducedDipoleS[1];
    float dukzs            = atomJ.inducedDipoleS[2];
Mark Friedrichs's avatar
Mark Friedrichs committed
169

170
171
172
    float pukxs            = atomJ.inducedDipolePolarS[0];
    float pukys            = atomJ.inducedDipolePolarS[1];
    float pukzs            = atomJ.inducedDipolePolarS[2];
Mark Friedrichs's avatar
Mark Friedrichs committed
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
 
    // reaction potential auxiliary terms
 
    a[1][0]                = -gf3;
    a[2][0]                = 3.0f * gf5;

    // reaction potential gradient auxiliary terms

    float expc1            = 1.0f - expc;
    a[1][1]                = expc1 * a[2][0];
 
    // unweighted dipole reaction potential gradient tensor

    gux[2]                 = cAmoebaSim.fd * (a[1][0] + xr2*a[1][1]);
    gux[3]                 = cAmoebaSim.fd * xr*yr*a[1][1];
    gux[4]                 = cAmoebaSim.fd * xr*zr*a[1][1];

    guy[2]                 = gux[3];
    guy[3]                 = cAmoebaSim.fd * (a[1][0] + yr2*a[1][1]);
    guy[4]                 = cAmoebaSim.fd * yr*zr*a[1][1];

    guz[2]                 = gux[4];
    guz[3]                 = guy[4];
    guz[4]                 = cAmoebaSim.fd * (a[1][0] + zr2*a[1][1]);
 
    gkField[0][0]          = dukxs*gux[2]+dukys*guy[2]+dukzs*guz[2];
    gkField[0][1]          = dukxs*gux[3]+dukys*guy[3]+dukzs*guz[3];
    gkField[0][2]          = dukxs*gux[4]+dukys*guy[4]+dukzs*guz[4];

    gkField[1][0]          = duixs*gux[2]+duiys*guy[2]+duizs*guz[2];
    gkField[1][1]          = duixs*gux[3]+duiys*guy[3]+duizs*guz[3];
    gkField[1][2]          = duixs*gux[4]+duiys*guy[4]+duizs*guz[4];

    gkField[2][0]          = pukxs*gux[2]+pukys*guy[2]+pukzs*guz[2];
    gkField[2][1]          = pukxs*gux[3]+pukys*guy[3]+pukzs*guz[3];
    gkField[2][2]          = pukxs*gux[4]+pukys*guy[4]+pukzs*guz[4];

    gkField[3][0]          = puixs*gux[2]+puiys*guy[2]+puizs*guz[2];
    gkField[3][1]          = puixs*gux[3]+puiys*guy[3]+puizs*guz[3];
    gkField[3][2]          = puixs*gux[4]+puiys*guy[4]+puizs*guz[4];

}

216
#ifdef AMOEBA_DEBUG
217
__device__ static int debugAccumulate( int index, float4* debugArray, float* field, unsigned int addMask, float idLabel )
Mark Friedrichs's avatar
Mark Friedrichs committed
218
219
220
221
222
223
224
225
226
{
    index                             += cAmoebaSim.paddedNumberOfAtoms;
    debugArray[index].x                = addMask ? field[0] : 0.0f;
    debugArray[index].y                = addMask ? field[1] : 0.0f;
    debugArray[index].z                = addMask ? field[2] : 0.0f;
    debugArray[index].w                = idLabel;

    return index;
}
227
#endif
Mark Friedrichs's avatar
Mark Friedrichs committed
228
229
230
231
232
233
234
235
236
237
238
239
240
241


// Include versions of the kernels for N^2 calculations.

#define METHOD_NAME(a, b) a##N2##b
#include "kCalculateAmoebaCudaMutualInducedAndGkFields.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##N2ByWarp##b
#include "kCalculateAmoebaCudaMutualInducedAndGkFields.h"

__global__ 
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
242
#elif (__CUDA_ARCH__ >= 120)
Mark Friedrichs's avatar
Mark Friedrichs committed
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
void kInitializeMutualInducedAndGkField_kernel(
                   float* fixedEField,
                   float* fixedEFieldPolar,
                   float* fixedGkField,
                   float* polarizability,
                   float* inducedDipole,
                   float* inducedDipolePolar,
                   float* inducedDipoleS,
                   float* inducedDipolePolarS )
{

    int threadId = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
    if( threadId >= 3*cAmoebaSim.numberOfAtoms )return;

    fixedEField[threadId]          *= polarizability[threadId];
    inducedDipole[threadId]         = fixedEField[threadId];

    fixedEFieldPolar[threadId]     *= polarizability[threadId];
    inducedDipolePolar[threadId]    = fixedEFieldPolar[threadId];

    fixedGkField[threadId]         *= polarizability[threadId];
    inducedDipoleS[threadId]        = fixedEField[threadId]       + fixedGkField[threadId];
    inducedDipolePolarS[threadId]   = fixedEFieldPolar[threadId]  + fixedGkField[threadId];

}

__global__ 
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
276
#elif (__CUDA_ARCH__ >= 120)
Mark Friedrichs's avatar
Mark Friedrichs committed
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
void kReduceMutualInducedAndGkFieldDelta_kernel( float* arrayOfDeltas1, float* arrayOfDeltas2,
                                                 float* arrayOfDeltas3, float* arrayOfDeltas4, float* epsilon )
{
    extern __shared__ float4 delta[];

    delta[threadIdx.x].x    = 0.0f;
    delta[threadIdx.x].y    = 0.0f;
    delta[threadIdx.x].z    = 0.0f;
    delta[threadIdx.x].w    = 0.0f;

    unsigned int pos        = threadIdx.x;

    // load deltas

295
    while( pos < 3*cAmoebaSim.numberOfAtoms )
Mark Friedrichs's avatar
Mark Friedrichs committed
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
    {   
        delta[threadIdx.x].x  += arrayOfDeltas1[pos];
        delta[threadIdx.x].y  += arrayOfDeltas2[pos];
        delta[threadIdx.x].z  += arrayOfDeltas3[pos];
        delta[threadIdx.x].w  += arrayOfDeltas4[pos];
        pos                   += blockDim.x*gridDim.x;
    }   
    __syncthreads();

    // sum the deltas

    for (int offset = 1; offset < blockDim.x; offset *= 2 )
    {   
        if (threadIdx.x + offset < blockDim.x && (threadIdx.x & (2*offset-1)) == 0)
        {
            delta[threadIdx.x].x   += delta[threadIdx.x+offset].x;
            delta[threadIdx.x].y   += delta[threadIdx.x+offset].y;
            delta[threadIdx.x].z   += delta[threadIdx.x+offset].z;
            delta[threadIdx.x].w   += delta[threadIdx.x+offset].w;
        }
        __syncthreads();
    }   

    // set epsilons

    if (threadIdx.x == 0)
    {   
        epsilon[0]  = delta[0].x;
        epsilon[0]  = epsilon[0] < delta[0].y ? delta[0].y : epsilon[0];
        epsilon[0]  = epsilon[0] < delta[0].z ? delta[0].z : epsilon[0];
        epsilon[0]  = epsilon[0] < delta[0].w ? delta[0].w : epsilon[0];
327
        epsilon[0]  = 48.033324f*sqrtf( epsilon[0]/( (float) cAmoebaSim.numberOfAtoms ) );
Mark Friedrichs's avatar
Mark Friedrichs committed
328
#ifdef AMOEBA_DEBUG
329
330
331
332
        epsilon[1]  = 48.033324f*sqrtf( delta[0].x/( (float) cAmoebaSim.numberOfAtoms ) );
        epsilon[2]  = 48.033324f*sqrtf( delta[0].y/( (float) cAmoebaSim.numberOfAtoms ) );
        epsilon[3]  = 48.033324f*sqrtf( delta[0].z/( (float) cAmoebaSim.numberOfAtoms ) );
        epsilon[4]  = 48.033324f*sqrtf( delta[0].w/( (float) cAmoebaSim.numberOfAtoms ) );
Mark Friedrichs's avatar
Mark Friedrichs committed
333
334
335
336
337
338
339
340
341
342
343
344
#endif
    }   
}

/**

   matrixProduct/matrixProductP contains epsilon**2 on output

*/
__global__ 
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
345
#elif (__CUDA_ARCH__ >= 120)
Mark Friedrichs's avatar
Mark Friedrichs committed
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
void kSorUpdateMutualInducedAndGkField_kernel(
                   float* polarizability,
                   float* inducedDipole, float* inducedDipoleP,
                   float* fixedEField,   float* fixedEFieldP,
                   float* matrixProduct, float* matrixProductP )
{

    float polarSOR = 0.70f;
    int threadId   = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
    if( threadId  >= 3*cAmoebaSim.numberOfAtoms)return;

    float previousDipole                = inducedDipole[threadId];
    float previousDipoleP               = inducedDipoleP[threadId];

    inducedDipole[threadId]             = fixedEField[threadId]     + polarizability[threadId]*matrixProduct[threadId];
    inducedDipoleP[threadId]            = fixedEFieldP[threadId]    + polarizability[threadId]*matrixProductP[threadId];

    inducedDipole[threadId]             = previousDipole   + polarSOR*( inducedDipole[threadId]   - previousDipole  );   
    inducedDipoleP[threadId]            = previousDipoleP  + polarSOR*( inducedDipoleP[threadId]  - previousDipoleP );

    matrixProduct[threadId]             = ( inducedDipole[threadId]  - previousDipole  )*( inducedDipole[threadId]  - previousDipole  );
    matrixProductP[threadId]            = ( inducedDipoleP[threadId] - previousDipoleP )*( inducedDipoleP[threadId] - previousDipoleP );

}

__global__ 
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
378
#elif (__CUDA_ARCH__ >= 120)
Mark Friedrichs's avatar
Mark Friedrichs committed
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
void kSorUpdateMutualInducedAndGkFieldS_kernel(
                   float* polarizability,
                   float* inducedDipole, float* inducedDipoleP,
                   float* fixedEField,   float* fixedEFieldP,
                   float* fixedGkField,
                   float* matrixProduct, float* matrixProductP )
{

    float polarSOR = 0.70f;
    int threadId   = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
    if( threadId  >= 3*cAmoebaSim.numberOfAtoms)return;

    float previousDipole                = inducedDipole[threadId];
    float previousDipoleP               = inducedDipoleP[threadId];

    inducedDipole[threadId]             = fixedGkField[threadId]    + fixedEField[threadId]     + polarizability[threadId]*matrixProduct[threadId];
    inducedDipoleP[threadId]            = fixedGkField[threadId]    + fixedEFieldP[threadId]    + polarizability[threadId]*matrixProductP[threadId];

    inducedDipole[threadId]             = previousDipole   + polarSOR*( inducedDipole[threadId]   - previousDipole  );   
    inducedDipoleP[threadId]            = previousDipoleP  + polarSOR*( inducedDipoleP[threadId]  - previousDipoleP );

    matrixProduct[threadId]             = ( inducedDipole[threadId]  - previousDipole  )*( inducedDipole[threadId]  - previousDipole  );
    matrixProductP[threadId]            = ( inducedDipoleP[threadId] - previousDipoleP )*( inducedDipoleP[threadId] - previousDipoleP );

}

// reduce psWorkArray_3_1 -> outputArray
// reduce psWorkArray_3_2 -> outputPolarArray
// reduce psWorkArray_3_3 -> outputArrayS
// reduce psWorkArray_3_4 -> outputPolarArrayS

static void kReduceMutualInducedAndGkFields(amoebaGpuContext amoebaGpu,
                                            CUDAStream<float>* outputArray,  CUDAStream<float>* outputPolarArray,
                                            CUDAStream<float>* outputArrayS, CUDAStream<float>* outputPolarArrayS )
{
    kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
                               amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
Mark Friedrichs's avatar
Mark Friedrichs committed
420
                               amoebaGpu->psWorkArray_3_1->_pDevData, outputArray->_pDevData );
Mark Friedrichs's avatar
Mark Friedrichs committed
421
422
423
424
    LAUNCHERROR("kReduceMutualInducedAndGkFields1");

    kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
                               amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
Mark Friedrichs's avatar
Mark Friedrichs committed
425
                               amoebaGpu->psWorkArray_3_2->_pDevData, outputPolarArray->_pDevData );
Mark Friedrichs's avatar
Mark Friedrichs committed
426
427
428
429
    LAUNCHERROR("kReduceMutualInducedAndGkFields2");

    kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
                               amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
Mark Friedrichs's avatar
Mark Friedrichs committed
430
                               amoebaGpu->psWorkArray_3_3->_pDevData, outputArrayS->_pDevData );
Mark Friedrichs's avatar
Mark Friedrichs committed
431
432
433
434
    LAUNCHERROR("kReduceMutualInducedAndGkFields3");

    kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
                               amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
Mark Friedrichs's avatar
Mark Friedrichs committed
435
                               amoebaGpu->psWorkArray_3_4->_pDevData, outputPolarArrayS->_pDevData );
Mark Friedrichs's avatar
Mark Friedrichs committed
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
    LAUNCHERROR("kReduceMutualInducedAndGkFields4");
}

#ifdef AMOEBA_DEBUG
#if 0
static void printMiFieldBuffer( amoebaGpuContext amoebaGpu, unsigned int bufferIndex )
{
    (void) fprintf( amoebaGpu->log, "MI Field Buffer %u\n", bufferIndex );
    unsigned int start = bufferIndex*3*amoebaGpu->paddedNumberOfAtoms;
    unsigned int stop  = (bufferIndex+1)*3*amoebaGpu->paddedNumberOfAtoms;
    for( unsigned int ii = start; ii < stop; ii += 3 ){
        unsigned int ii3Index      = ii/3;
        unsigned int bufferIndex   = ii3Index/(amoebaGpu->paddedNumberOfAtoms);
        unsigned int particleIndex = ii3Index - bufferIndex*(amoebaGpu->paddedNumberOfAtoms);
        (void) fprintf( amoebaGpu->log, "   %6u %3u %6u [%14.6e %14.6e %14.6e] [%14.6e %14.6e %14.6e]\n", 
                            ii/3,  bufferIndex, particleIndex,
Mark Friedrichs's avatar
Mark Friedrichs committed
452
453
454
455
456
457
                            amoebaGpu->psWorkArray_3_1->_pSysData[ii],
                            amoebaGpu->psWorkArray_3_1->_pSysData[ii+1],
                            amoebaGpu->psWorkArray_3_1->_pSysData[ii+2],
                            amoebaGpu->psWorkArray_3_2->_pSysData[ii],
                            amoebaGpu->psWorkArray_3_2->_pSysData[ii+1],
                            amoebaGpu->psWorkArray_3_2->_pSysData[ii+2] );
Mark Friedrichs's avatar
Mark Friedrichs committed
458
459
460
461
462
463
464
465
466
467
    } 
}

static void printMiFieldAtomBuffers( amoebaGpuContext amoebaGpu, unsigned int targetAtom )
{
    (void) fprintf( amoebaGpu->log, "MI Field atom %u\n", targetAtom );
    for( unsigned int ii = 0; ii < amoebaGpu->outputBuffers; ii++ ){
        unsigned int particleIndex = 3*(targetAtom + ii*amoebaGpu->paddedNumberOfAtoms);
        (void) fprintf( amoebaGpu->log, " %2u %6u [%14.6e %14.6e %14.6e] [%14.6e %14.6e %14.6e]\n", 
                        ii, particleIndex,
Mark Friedrichs's avatar
Mark Friedrichs committed
468
469
470
471
472
473
                        amoebaGpu->psWorkArray_3_1->_pSysData[particleIndex],
                        amoebaGpu->psWorkArray_3_1->_pSysData[particleIndex+1],
                        amoebaGpu->psWorkArray_3_1->_pSysData[particleIndex+2],
                        amoebaGpu->psWorkArray_3_2->_pSysData[particleIndex],
                        amoebaGpu->psWorkArray_3_2->_pSysData[particleIndex+1],
                        amoebaGpu->psWorkArray_3_2->_pSysData[particleIndex+2] );
Mark Friedrichs's avatar
Mark Friedrichs committed
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
    } 
}
#endif
#endif

/**---------------------------------------------------------------------------------------

   Compute mutual induce field

   @param amoebaGpu        amoebaGpu context

   --------------------------------------------------------------------------------------- */

static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuContext amoebaGpu,
                                                                    CUDAStream<float>* outputArray,  CUDAStream<float>* outputPolarArray,
                                                                    CUDAStream<float>* outputArrayS, CUDAStream<float>* outputPolarArrayS )
{
  
   // ---------------------------------------------------------------------------------------

    static unsigned int threadsPerBlock = 0;

   // ---------------------------------------------------------------------------------------

  gpuContext gpu    = amoebaGpu->gpuContext;

#ifdef AMOEBA_DEBUG
    static int iteration = 1;
    int targetAtom    = 0;
    static const char* methodName       = "cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply";
    if( 1 && amoebaGpu->log ){
505
        (void) fprintf( amoebaGpu->log, "%s\n", methodName );
Mark Friedrichs's avatar
Mark Friedrichs committed
506
507
508
509
        (void) fflush( amoebaGpu->log );
    }
    int paddedNumberOfAtoms                    = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
    CUDAStream<float4>* debugArray             = new CUDAStream<float4>(paddedNumberOfAtoms*paddedNumberOfAtoms, 1, "DebugArray");
Mark Friedrichs's avatar
Mark Friedrichs committed
510
    memset( debugArray->_pSysData,      0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
Mark Friedrichs's avatar
Mark Friedrichs committed
511
512
513
514
515
516
517
518
519
520
    debugArray->Upload();
#endif

    // clear output arrays

    kClearFields_3( amoebaGpu, 4 );

    // set threads/block first time through

    if( threadsPerBlock == 0 ){
521
522
        unsigned int maxThreads;
        if (gpu->sm_version >= SM_20)
Peter Eastman's avatar
Peter Eastman committed
523
            maxThreads = 384;
524
525
526
527
528
        else if (gpu->sm_version >= SM_12)
            maxThreads = 128;
        else
            maxThreads = 64;
        threadsPerBlock = std::min(getThreadsPerBlock( amoebaGpu, sizeof(MutualInducedParticle)), maxThreads);
Mark Friedrichs's avatar
Mark Friedrichs committed
529
530
531
    }
    
    if (gpu->bOutputBufferPerWarp){
532
        kCalculateAmoebaMutualInducedAndGkFieldsN2ByWarp_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
Mark Friedrichs's avatar
Mark Friedrichs committed
533
534
535
536
                                                                 amoebaGpu->psWorkUnit->_pDevData,
                                                                 amoebaGpu->psWorkArray_3_1->_pDevData,
                                                                 amoebaGpu->psWorkArray_3_2->_pDevData,
                                                                 amoebaGpu->psWorkArray_3_3->_pDevData,
537
#ifdef AMOEBA_DEBUG
Mark Friedrichs's avatar
Mark Friedrichs committed
538
539
                                                                 amoebaGpu->psWorkArray_3_4->_pDevData,
                                                                 debugArray->_pDevData, targetAtom );
Mark Friedrichs's avatar
Mark Friedrichs committed
540
#else
Mark Friedrichs's avatar
Mark Friedrichs committed
541
                                                                 amoebaGpu->psWorkArray_3_4->_pDevData );
Mark Friedrichs's avatar
Mark Friedrichs committed
542
543
544
545
546
#endif
    } else {

#ifdef AMOEBA_DEBUG
        (void) fprintf( amoebaGpu->log, "N2 no warp\n" );
547
        (void) fprintf( amoebaGpu->log, "cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u\n",
Mark Friedrichs's avatar
Mark Friedrichs committed
548
549
                        amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
                        sizeof(MutualInducedParticle), sizeof(MutualInducedParticle)*threadsPerBlock,
550
                        (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
Mark Friedrichs's avatar
Mark Friedrichs committed
551
552
553
554
        (void) fflush( amoebaGpu->log );
#endif

        kCalculateAmoebaMutualInducedAndGkFieldsN2_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
Mark Friedrichs's avatar
Mark Friedrichs committed
555
556
557
558
                                                            amoebaGpu->psWorkUnit->_pDevData,
                                                            amoebaGpu->psWorkArray_3_1->_pDevData,
                                                            amoebaGpu->psWorkArray_3_2->_pDevData,
                                                            amoebaGpu->psWorkArray_3_3->_pDevData,
Mark Friedrichs's avatar
Mark Friedrichs committed
559
#ifdef AMOEBA_DEBUG
Mark Friedrichs's avatar
Mark Friedrichs committed
560
561
                                                            amoebaGpu->psWorkArray_3_4->_pDevData,
                                                            debugArray->_pDevData, targetAtom );
Mark Friedrichs's avatar
Mark Friedrichs committed
562
#else
Mark Friedrichs's avatar
Mark Friedrichs committed
563
                                                            amoebaGpu->psWorkArray_3_4->_pDevData );
Mark Friedrichs's avatar
Mark Friedrichs committed
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
#endif

    }
    LAUNCHERROR("kCalculateAmoebaMutualInducedAndGkFields");  

    kReduceMutualInducedAndGkFields( amoebaGpu, outputArray, outputPolarArray, outputArrayS, outputPolarArrayS );

#ifdef AMOEBA_DEBUG
        amoebaGpu->psWorkArray_3_1->Download();
        amoebaGpu->psWorkArray_3_2->Download();
        amoebaGpu->psWorkArray_3_3->Download();
        amoebaGpu->psWorkArray_3_4->Download();

        //printMiFieldAtomBuffers( amoebaGpu, (targetAtom + 0) );
        //printMiFieldAtomBuffers( amoebaGpu, (targetAtom + 1) );
        //printMiFieldAtomBuffers( amoebaGpu, 100 );
        //printMiFieldBuffer( amoebaGpu, 0 );
        //printMiFieldBuffer( amoebaGpu, 1 );
        //printMiFieldBuffer( amoebaGpu, 37 );
        //printMiFieldBuffer( amoebaGpu, 38 );

585
    if( amoebaGpu->log && iteration == 1 ){
Mark Friedrichs's avatar
Mark Friedrichs committed
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605

        (void) fprintf( amoebaGpu->log, "Finished MI kernel execution %d\n", iteration ); (void) fflush( amoebaGpu->log );

        outputArray->Download();
        outputPolarArray->Download();

        outputArrayS->Download();
        outputPolarArrayS->Download();

        debugArray->Download();

        int maxPrint        = 33;
        for( int ii = 0; ii < gpu->natoms; ii++ ){
           (void) fprintf( amoebaGpu->log, "%5d ", ii); 

            int indexOffset     = ii*3;
    
           // Mi

           (void) fprintf( amoebaGpu->log," Mult[%16.9e %16.9e %16.9e] ",
Mark Friedrichs's avatar
Mark Friedrichs committed
606
607
608
                           outputArray->_pSysData[indexOffset],
                           outputArray->_pSysData[indexOffset+1],
                           outputArray->_pSysData[indexOffset+2] );
Mark Friedrichs's avatar
Mark Friedrichs committed
609
610
611
612
    
           // Mi polar

           (void) fprintf( amoebaGpu->log," MultP[%16.9e %16.9e %16.9e]\n",
Mark Friedrichs's avatar
Mark Friedrichs committed
613
614
615
                           outputPolarArray->_pSysData[indexOffset],
                           outputPolarArray->_pSysData[indexOffset+1],
                           outputPolarArray->_pSysData[indexOffset+2] );
Mark Friedrichs's avatar
Mark Friedrichs committed
616
617
618
619

           // MiS

           (void) fprintf( amoebaGpu->log,"      MultS[%16.9e %16.9e %16.9e] ",
Mark Friedrichs's avatar
Mark Friedrichs committed
620
621
622
                           outputArrayS->_pSysData[indexOffset],
                           outputArrayS->_pSysData[indexOffset+1],
                           outputArrayS->_pSysData[indexOffset+2] );
Mark Friedrichs's avatar
Mark Friedrichs committed
623
624
625
626
    
           // Mi polarS

           (void) fprintf( amoebaGpu->log,"MultPS[%16.9e %16.9e %16.9e]\n",
Mark Friedrichs's avatar
Mark Friedrichs committed
627
628
629
                           outputPolarArrayS->_pSysData[indexOffset],
                           outputPolarArrayS->_pSysData[indexOffset+1],
                           outputPolarArrayS->_pSysData[indexOffset+2] );
Mark Friedrichs's avatar
Mark Friedrichs committed
630
631
632
633
634

           // coords

#if 0
            (void) fprintf( amoebaGpu->log,"x[%16.9e %16.9e %16.9e] ",
Mark Friedrichs's avatar
Mark Friedrichs committed
635
636
637
                            gpu->psPosq4->_pSysData[ii].x,
                            gpu->psPosq4->_pSysData[ii].y,
                            gpu->psPosq4->_pSysData[ii].z);
Mark Friedrichs's avatar
Mark Friedrichs committed
638
639
640
641


           for( int jj = 0; jj < gpu->natoms && jj < 5; jj++ ){
               int debugIndex = jj*gpu->natoms + ii;
Mark Friedrichs's avatar
Mark Friedrichs committed
642
643
644
               float xx       =  gpu->psPosq4->_pSysData[jj].x -  gpu->psPosq4->_pSysData[ii].x;
               float yy       =  gpu->psPosq4->_pSysData[jj].y -  gpu->psPosq4->_pSysData[ii].y;
               float zz       =  gpu->psPosq4->_pSysData[jj].z -  gpu->psPosq4->_pSysData[ii].z;
Mark Friedrichs's avatar
Mark Friedrichs committed
645
646
               (void) fprintf( amoebaGpu->log,"\n%4d %4d delta [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e] ",
                               ii, jj, xx, yy, zz,
Mark Friedrichs's avatar
Mark Friedrichs committed
647
                               debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y, debugArray->_pSysData[debugIndex].z );
Mark Friedrichs's avatar
Mark Friedrichs committed
648
649
650
651
652
653
654
655
656
657

           }
#endif
           if( ii == targetAtom ){
               (void) fprintf( amoebaGpu->log,"\n" );
               int paddedNumberOfAtoms                    = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
               for( int jj = 0; jj < gpu->natoms; jj++ ){
                   int debugIndex = jj;
                   (void) fprintf( amoebaGpu->log,"%4d %4d Rint [%16.9e %16.9e %16.9e %16.9e]\n",
                                   ii, jj,
Mark Friedrichs's avatar
Mark Friedrichs committed
658
659
                                   debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
                                   debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w );
Mark Friedrichs's avatar
Mark Friedrichs committed
660
661
662
663

                   for( int kk = 0; kk < 9; kk++ ){
                       debugIndex += paddedNumberOfAtoms;
                       (void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %5.1f]\n",
Mark Friedrichs's avatar
Mark Friedrichs committed
664
                                       debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y, debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w );
Mark Friedrichs's avatar
Mark Friedrichs committed
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
                   }
               }
               (void) fprintf( amoebaGpu->log,"\n" );
           }
           (void) fprintf( amoebaGpu->log,"\n" );
           if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
                ii = gpu->natoms - maxPrint;
           }
        }
        (void) fflush( amoebaGpu->log );
        iteration++;

     }
     delete debugArray;
#endif

}

/**---------------------------------------------------------------------------------------

   Compute mutual induce field

   @param amoebaGpu        amoebaGpu context

   --------------------------------------------------------------------------------------- */

static void cudaComputeAmoebaMutualInducedAndGkFieldBySOR( amoebaGpuContext amoebaGpu )
{
  
   // ---------------------------------------------------------------------------------------

696
    static int timestep                  = 0;
Mark Friedrichs's avatar
Mark Friedrichs committed
697
    timestep++;
Mark Friedrichs's avatar
Update  
Mark Friedrichs committed
698
#ifdef AMOEBA_DEBUG
699
    static const char* methodName        = "cudaComputeAmoebaMutualInducedAndGkFieldBySOR";
Mark Friedrichs's avatar
Update  
Mark Friedrichs committed
700
    std::vector<int> fileId;
Mark Friedrichs's avatar
Mark Friedrichs committed
701
702
703
704
705
706
707
708
709
710
    fileId.resize( 2 );
    fileId[0] = timestep;
    fileId[1] = 1;
#endif

   // ---------------------------------------------------------------------------------------

    int done;
    int iteration;

711
    gpuContext gpu    = amoebaGpu->gpuContext;
Mark Friedrichs's avatar
Mark Friedrichs committed
712
713
714
715
716
717
718
    int numOfElems     = gpu->natoms*3;
    int numThreads     = min( THREADS_PER_BLOCK, numOfElems );
    int numBlocks      = numOfElems/numThreads;

    if( (numOfElems % numThreads) != 0 )numBlocks++;

#ifdef AMOEBA_DEBUG
Mark Friedrichs's avatar
Update  
Mark Friedrichs committed
719
    if( amoebaGpu->log && timestep == 1 ){
Mark Friedrichs's avatar
Mark Friedrichs committed
720
721
722
723
724
725
726
727
728
729
730
731
732
733
        (void) fprintf( amoebaGpu->log, "%s %d numOfElems=%d numThreads=%d numBlocks=%d "
                        "maxIterations=%d targetEpsilon=%.3e\n", 
                        methodName, gpu->natoms, numOfElems, numThreads, numBlocks,
                        amoebaGpu->mutualInducedMaxIterations, amoebaGpu->mutualInducedTargetEpsilon);
        (void) fflush( amoebaGpu->log );
    }   
#endif

   // ---------------------------------------------------------------------------------------

    // set  E_Field & E_FieldPolar] to [ E_Field & E_FieldPolar]*Polarizability
    // initialize [ InducedDipole & InducedDipolePolar ] to [ E_Field & E_FieldPolar]*Polarizability

    kInitializeMutualInducedAndGkField_kernel<<< numBlocks, numThreads >>>(
Mark Friedrichs's avatar
Mark Friedrichs committed
734
735
736
737
738
739
740
741
         amoebaGpu->psE_Field->_pDevData,
         amoebaGpu->psE_FieldPolar->_pDevData,
         amoebaGpu->psGk_Field->_pDevData,
         amoebaGpu->psPolarizability->_pDevData,
         amoebaGpu->psInducedDipole->_pDevData,
         amoebaGpu->psInducedDipolePolar->_pDevData,
         amoebaGpu->psInducedDipoleS->_pDevData,
         amoebaGpu->psInducedDipolePolarS->_pDevData );
Mark Friedrichs's avatar
Mark Friedrichs committed
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
    LAUNCHERROR("kInitializeMutualInducedAndGkField");  

#ifdef AMOEBA_DEBUG
    if( amoebaGpu->log ){

        amoebaGpu->psE_Field->Download();
        amoebaGpu->psE_FieldPolar->Download();
        amoebaGpu->psInducedDipole->Download(),
        amoebaGpu->psInducedDipolePolar->Download();
        amoebaGpu->psInducedDipoleS->Download(),
        amoebaGpu->psInducedDipolePolarS->Download();
        amoebaGpu->psPolarizability->Download();

        (void) fprintf( amoebaGpu->log, "%s Initial setup for matrix multiply\n", methodName );
        int offset   = 0;
        int maxPrint = 10;
        for( int ii = 0; ii < gpu->natoms; ii++ ){
            (void) fprintf( amoebaGpu->log, "\n%4d pol=%12.4e\n", ii, 
Mark Friedrichs's avatar
Mark Friedrichs committed
760
761
762
763
                            amoebaGpu->psPolarizability->_pSysData[offset] );
            if( amoebaGpu->psPolarizability->_pSysData[offset] != amoebaGpu->psPolarizability->_pSysData[offset+1] ||
                amoebaGpu->psPolarizability->_pSysData[offset] != amoebaGpu->psPolarizability->_pSysData[offset+2] ){
                (void) fprintf( amoebaGpu->log, "PolX!!! %12.4e %12.4e ", amoebaGpu->psPolarizability->_pSysData[offset+1], amoebaGpu->psPolarizability->_pSysData[offset+2] ); 
Mark Friedrichs's avatar
Mark Friedrichs committed
764
765
766
            }

            (void) fprintf( amoebaGpu->log," E[%14.6e %14.6e %14.6e]  Mi[%14.6e %14.6e %14.6e]\n",
Mark Friedrichs's avatar
Mark Friedrichs committed
767
768
                            amoebaGpu->psE_Field->_pSysData[offset],       amoebaGpu->psE_Field->_pSysData[offset+1],       amoebaGpu->psE_Field->_pSysData[offset+2],
                            amoebaGpu->psInducedDipole->_pSysData[offset], amoebaGpu->psInducedDipole->_pSysData[offset+1], amoebaGpu->psInducedDipole->_pSysData[offset+2] );
Mark Friedrichs's avatar
Mark Friedrichs committed
769
            (void) fprintf( amoebaGpu->log,"Ep[%14.6e %14.6e %14.6e] Mip[%14.6e %14.6e %14.6e]\n",
Mark Friedrichs's avatar
Mark Friedrichs committed
770
771
                            amoebaGpu->psE_FieldPolar->_pSysData[offset],       amoebaGpu->psE_FieldPolar->_pSysData[offset+1],       amoebaGpu->psE_FieldPolar->_pSysData[offset+2],
                            amoebaGpu->psInducedDipolePolar->_pSysData[offset], amoebaGpu->psInducedDipolePolar->_pSysData[offset+1], amoebaGpu->psInducedDipolePolar->_pSysData[offset+2] );
Mark Friedrichs's avatar
Mark Friedrichs committed
772
773

            (void) fprintf( amoebaGpu->log,"Gk[%14.6e %14.6e %14.6e] MiS[%14.6e %14.6e %14.6e] MipS[%14.6e %14.6e %14.6e]\n",
Mark Friedrichs's avatar
Mark Friedrichs committed
774
775
776
                            amoebaGpu->psGk_Field->_pSysData[offset],            amoebaGpu->psGk_Field->_pSysData[offset+1],            amoebaGpu->psGk_Field->_pSysData[offset+2],
                            amoebaGpu->psInducedDipoleS->_pSysData[offset],      amoebaGpu->psInducedDipoleS->_pSysData[offset+1],      amoebaGpu->psInducedDipoleS->_pSysData[offset+2],
                            amoebaGpu->psInducedDipolePolarS->_pSysData[offset], amoebaGpu->psInducedDipolePolarS->_pSysData[offset+1], amoebaGpu->psInducedDipolePolarS->_pSysData[offset+2] );
Mark Friedrichs's avatar
Mark Friedrichs committed
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
            offset += 3;
            if( ii == maxPrint && (ii < (gpu->natoms - maxPrint) ) )ii =  (gpu->natoms - maxPrint);
        }   
        (void) fflush( amoebaGpu->log );
    }   
#endif

    // ---------------------------------------------------------------------------------------
 
    done      = 0;
    iteration = 1;

    while( !done ){

        // matrix multiply

        cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpu,
                                                                amoebaGpu->psWorkVector[0],  amoebaGpu->psWorkVector[1],
                                                                amoebaGpu->psWorkVector[2],  amoebaGpu->psWorkVector[3] );

        LAUNCHERROR("cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply");  

    // ---------------------------------------------------------------------------------------

        // post matrix multiply

        kSorUpdateMutualInducedAndGkField_kernel<<< numBlocks, numThreads >>>(
Mark Friedrichs's avatar
Mark Friedrichs committed
804
805
806
807
           amoebaGpu->psPolarizability->_pDevData,
           amoebaGpu->psInducedDipole->_pDevData,     amoebaGpu->psInducedDipolePolar->_pDevData,
           amoebaGpu->psE_Field->_pDevData,           amoebaGpu->psE_FieldPolar->_pDevData,
           amoebaGpu->psWorkVector[0]->_pDevData,     amoebaGpu->psWorkVector[1]->_pDevData );
Mark Friedrichs's avatar
Mark Friedrichs committed
808
809
810
        LAUNCHERROR("cudaComputeAmoebaMutualInducedAndGkFieldSorUpdate1");  

        kSorUpdateMutualInducedAndGkFieldS_kernel<<< numBlocks, numThreads >>>(
Mark Friedrichs's avatar
Mark Friedrichs committed
811
812
813
814
815
           amoebaGpu->psPolarizability->_pDevData,
           amoebaGpu->psInducedDipoleS->_pDevData,    amoebaGpu->psInducedDipolePolarS->_pDevData,
           amoebaGpu->psE_Field->_pDevData,          amoebaGpu->psE_FieldPolar->_pDevData,
           amoebaGpu->psGk_Field->_pDevData,
           amoebaGpu->psWorkVector[2]->_pDevData,     amoebaGpu->psWorkVector[3]->_pDevData );
Mark Friedrichs's avatar
Mark Friedrichs committed
816
817
818
819
820
        LAUNCHERROR("cudaComputeAmoebaMutualInducedAndGkFieldSorUpdate2");  

        // get total epsilon -- performing sums on gpu

        kReduceMutualInducedAndGkFieldDelta_kernel<<<1, amoebaGpu->epsilonThreadsPerBlock, 4*sizeof(float)*amoebaGpu->epsilonThreadsPerBlock>>>(
Mark Friedrichs's avatar
Mark Friedrichs committed
821
822
823
           amoebaGpu->psWorkVector[0]->_pDevData, amoebaGpu->psWorkVector[1]->_pDevData,
           amoebaGpu->psWorkVector[2]->_pDevData, amoebaGpu->psWorkVector[3]->_pDevData,
           amoebaGpu->psCurrentEpsilon->_pDevData );
Mark Friedrichs's avatar
Mark Friedrichs committed
824
825
826
827
828
829
830
831
832
        LAUNCHERROR("kReduceMutualInducedAndGkFieldDelta_kernel");

#if 0
        // get total epsilon -- performing sums on cpu
{
        float sum[4];
        float currentEpsilon = -1.0e30;
        for( int ii = 0; ii < 4; ii++ ){
            sum[ii]                  = cudaGetSum( 3*gpu->natoms, amoebaGpu->psWorkVector[ii]);
833
            sum[ii]                  = 48.033324f*sqrtf( sum[ii]/( (float) gpu->natoms) );
Mark Friedrichs's avatar
Mark Friedrichs committed
834
835
836
837
838
839
840
841
            if( sum[ii] > currentEpsilon ){
                currentEpsilon = sum[ii];
            }
        }

        amoebaGpu->mutualInducedCurrentEpsilon  = currentEpsilon;
        (void) fprintf( amoebaGpu->log, "%s iteration=%3d eps %14.6e [%14.6e %14.6e] done=%d sums=%14.6e %14.6e %14.6e %14.6e\n",
                        methodName, iteration, amoebaGpu->mutualInducedCurrentEpsilon,
Mark Friedrichs's avatar
Mark Friedrichs committed
842
843
                           amoebaGpu->psCurrentEpsilon->_pSysData[1], 
                           amoebaGpu->psCurrentEpsilon->_pSysData[2], done, sum[0], sum[1], sum[2], sum[3] );
Mark Friedrichs's avatar
Mark Friedrichs committed
844
845
846
}
#endif

847
        // Debye=48.033324f
848

Mark Friedrichs's avatar
Mark Friedrichs committed
849
        amoebaGpu->psCurrentEpsilon->Download();
Mark Friedrichs's avatar
Mark Friedrichs committed
850
        float currentEpsilon                     = amoebaGpu->psCurrentEpsilon->_pSysData[0];
Mark Friedrichs's avatar
Mark Friedrichs committed
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
        amoebaGpu->mutualInducedCurrentEpsilon   = currentEpsilon;

        if( iteration > amoebaGpu->mutualInducedMaxIterations || amoebaGpu->mutualInducedCurrentEpsilon < amoebaGpu->mutualInducedTargetEpsilon ){ 
            done = 1;
        }

#ifdef AMOEBA_DEBUG
        if( amoebaGpu->log ){
           amoebaGpu->psInducedDipole->Download();
           amoebaGpu->psInducedDipolePolar->Download();
           amoebaGpu->psInducedDipoleS->Download();
           amoebaGpu->psInducedDipolePolarS->Download();
           amoebaGpu->psWorkVector[2]->Download();
           amoebaGpu->psWorkVector[3]->Download();
           amoebaGpu->psGk_Field->Download();

#if 1
           (void) fprintf( amoebaGpu->log, "%s iteration=%3d eps %14.6e [%14.6e %14.6e] done=%d\n",
                           methodName, iteration, amoebaGpu->mutualInducedCurrentEpsilon,
Mark Friedrichs's avatar
Mark Friedrichs committed
870
871
                           amoebaGpu->psCurrentEpsilon->_pSysData[1], 
                           amoebaGpu->psCurrentEpsilon->_pSysData[2], done );
Mark Friedrichs's avatar
Mark Friedrichs committed
872
873
874
#else
           (void) fprintf( amoebaGpu->log, "%s iteration=%3d eps %14.6e %14.6e crrntEps=%14.6e %14.6e %14.6e %14.6e done=%d\n",
                           methodName, iteration, sum1, sum2, amoebaGpu->mutualInducedCurrentEpsilon,
Mark Friedrichs's avatar
Mark Friedrichs committed
875
876
877
                           amoebaGpu->psCurrentEpsilon->_pSysData[0], 
                           amoebaGpu->psCurrentEpsilon->_pSysData[1], 
                           amoebaGpu->psCurrentEpsilon->_pSysData[2], done );
Mark Friedrichs's avatar
Mark Friedrichs committed
878
879
880
881
882
883
884
885
886
#endif
           (void) fflush( amoebaGpu->log );

            int offset   = 0;
            int maxPrint = 20;
            for( int ii = 0; ii < gpu->natoms; ii++ ){
                (void) fprintf( amoebaGpu->log, "%4d ", ii ); 
    
                (void) fprintf( amoebaGpu->log," Mi[%14.6e %14.6e %14.6e] ",
Mark Friedrichs's avatar
Mark Friedrichs committed
887
                                amoebaGpu->psInducedDipole->_pSysData[offset], amoebaGpu->psInducedDipole->_pSysData[offset+1], amoebaGpu->psInducedDipole->_pSysData[offset+2] );
Mark Friedrichs's avatar
Mark Friedrichs committed
888
                (void) fprintf( amoebaGpu->log,"Mip[%14.6e %14.6e %14.6e]",
Mark Friedrichs's avatar
Mark Friedrichs committed
889
                                amoebaGpu->psInducedDipolePolar->_pSysData[offset], amoebaGpu->psInducedDipolePolar->_pSysData[offset+1], amoebaGpu->psInducedDipolePolar->_pSysData[offset+2] );
Mark Friedrichs's avatar
Mark Friedrichs committed
890
                (void) fprintf( amoebaGpu->log," MiS[%14.6e %14.6e %14.6e] ",
Mark Friedrichs's avatar
Mark Friedrichs committed
891
                                amoebaGpu->psInducedDipoleS->_pSysData[offset], amoebaGpu->psInducedDipoleS->_pSysData[offset+1], amoebaGpu->psInducedDipoleS->_pSysData[offset+2] );
Mark Friedrichs's avatar
Mark Friedrichs committed
892
                (void) fprintf( amoebaGpu->log,"MipS[%14.6e %14.6e %14.6e]\n",
Mark Friedrichs's avatar
Mark Friedrichs committed
893
                                amoebaGpu->psInducedDipolePolarS->_pSysData[offset], amoebaGpu->psInducedDipolePolarS->_pSysData[offset+1], amoebaGpu->psInducedDipolePolarS->_pSysData[offset+2] );
Mark Friedrichs's avatar
Mark Friedrichs committed
894
895
/*
                (void) fprintf( amoebaGpu->log,"Gk [%14.6e %14.6e %14.6e]\n",
Mark Friedrichs's avatar
Mark Friedrichs committed
896
                                amoebaGpu->psGk_Field->_pSysData[offset], amoebaGpu->psGk_Field->_pSysData[offset+1], amoebaGpu->psGk_Field->_pSysData[offset+2] );
Mark Friedrichs's avatar
Mark Friedrichs committed
897
                (void) fprintf( amoebaGpu->log,"W2 [%14.6e %14.6e %14.6e] ",
Mark Friedrichs's avatar
Mark Friedrichs committed
898
                                amoebaGpu->psWorkVector[2]->_pSysData[offset], amoebaGpu->psWorkVector[2]->_pSysData[offset+1], amoebaGpu->psWorkVector[2]->_pSysData[offset+2] );
Mark Friedrichs's avatar
Mark Friedrichs committed
899
                (void) fprintf( amoebaGpu->log,"W3 [%14.6e %14.6e %14.6e]\n",
Mark Friedrichs's avatar
Mark Friedrichs committed
900
                                amoebaGpu->psWorkVector[3]->_pSysData[offset], amoebaGpu->psWorkVector[3]->_pSysData[offset+1], amoebaGpu->psWorkVector[3]->_pSysData[offset+2] );
Mark Friedrichs's avatar
Mark Friedrichs committed
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
*/
                if( ii == maxPrint && (ii < (gpu->natoms - maxPrint) ) ){
                    ii =  (gpu->natoms - maxPrint);
                    offset = 3*(ii+1);
                } else {
                    offset += 3;
                }
            }   
            (void) fflush( amoebaGpu->log );
        }
#endif
        iteration++;
    }

    amoebaGpu->mutualInducedDone             = done;
    amoebaGpu->mutualInducedConverged        = ( !done || iteration > amoebaGpu->mutualInducedMaxIterations ) ? 0 : 1;

Mark Friedrichs's avatar
Update  
Mark Friedrichs committed
918
    if( amoebaGpu->log ){
919
        trackMutualInducedIterations( amoebaGpu, iteration );
Mark Friedrichs's avatar
Update  
Mark Friedrichs committed
920
921
    }

Mark Friedrichs's avatar
Mark Friedrichs committed
922
#ifdef AMOEBA_DEBUG
923
    if( 0 ){
Mark Friedrichs's avatar
Mark Friedrichs committed
924
925
926
        std::vector<int> fileId;
        //fileId.push_back( 0 );
        VectorOfDoubleVectors outputVector;
927
928
929
930
931
932
        //cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,                    outputVector, NULL, 1.0f );
        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psInducedDipole,       outputVector, NULL, 1.0f );
        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psInducedDipolePolar,  outputVector, NULL, 1.0f );
        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psInducedDipoleS,      outputVector, NULL, 1.0f );
        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psInducedDipolePolarS, outputVector, NULL, 1.0f );
        cudaWriteVectorOfDoubleVectorsToFile( "Cuda_GK_MI", fileId, outputVector );
Mark Friedrichs's avatar
Mark Friedrichs committed
933
934
935
936
937
938
939
940
941
942
943
944
     }
#endif

   // ---------------------------------------------------------------------------------------
}

void cudaComputeAmoebaMutualInducedAndGkField( amoebaGpuContext amoebaGpu )
{
    if( amoebaGpu->mutualInducedIterativeMethod == 0 ){
        cudaComputeAmoebaMutualInducedAndGkFieldBySOR( amoebaGpu );
    }
}