kCalculateAmoebaCudaMutualInducedAndGkFields.cu 45.1 KB
Newer Older
Mark Friedrichs's avatar
Mark Friedrichs committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
//-----------------------------------------------------------------------------------------

//-----------------------------------------------------------------------------------------

#include "amoebaGpuTypes.h"
#include "amoebaCudaKernels.h"
#include "kCalculateAmoebaCudaUtilities.h"

#include <stdio.h>

using namespace std;

static __constant__ cudaGmxSimulation cSim;
static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;

void SetCalculateAmoebaCudaMutualInducedAndGkFieldsSim(amoebaGpuContext amoebaGpu)
{
    cudaError_t status;
    gpuContext gpu = amoebaGpu->gpuContext;
    status         = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));    
    RTERROR(status, "SetCalculateAmoebaCudaMutualInducedAndGkFieldSim: cudaMemcpyToSymbol: SetSim copy to cSim failed");
    status         = cudaMemcpyToSymbol(cAmoebaSim, &amoebaGpu->amoebaSim, sizeof(cudaAmoebaGmxSimulation));    
    RTERROR(status, "SetCalculateAmoebaCudaMutualInducedAndGkFieldSim: cudaMemcpyToSymbol: SetSim copy to cAmoebaSim failed");
}

void GetCalculateAmoebaCudaMutualInducedAndGkFieldsSim(amoebaGpuContext amoebaGpu)
{
    cudaError_t status;
    gpuContext gpu = amoebaGpu->gpuContext;
    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));    
    RTERROR(status, "GetCalculateAmoebaCudaMutualInducedAndGkFieldSim: cudaMemcpyFromSymbol: SetSim copy from cSim failed");
    status = cudaMemcpyFromSymbol(&amoebaGpu->amoebaSim, cAmoebaSim, sizeof(cudaAmoebaGmxSimulation));    
    RTERROR(status, "GetCalculateAmoebaCudaMutualInducedAndGkFieldSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed");
}

Mark Friedrichs's avatar
Mark Friedrichs committed
36
37
//#define AMOEBA_DEBUG
#undef AMOEBA_DEBUG
Mark Friedrichs's avatar
Mark Friedrichs committed
38

39
40
41
42
43
#define GK
#include "kCalculateAmoebaCudaMutualInducedParticle.h"
#undef GK

__device__ void calculateMutualInducedAndGkFieldsPairIxn_kernel( MutualInducedParticle& atomI, MutualInducedParticle& atomJ,
Mark Friedrichs's avatar
Mark Friedrichs committed
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
                                                                 float fields[8][3]

#ifdef AMOEBA_DEBUG
               , float4* debugArray
#endif

 )
{

    float deltaR[3];
    
    // ---------------------------------------------------------------------------------------
    
    // get deltaR, and r between 2 atoms
    
59
60
61
    deltaR[0]                                    = atomJ.x - atomI.x;
    deltaR[1]                                    = atomJ.y - atomI.y;
    deltaR[2]                                    = atomJ.z - atomI.z;
Mark Friedrichs's avatar
Mark Friedrichs committed
62
63
64
65
66
67
68

    float r                                      =  sqrtf( deltaR[0]*deltaR[0] + deltaR[1]*deltaR[1] + deltaR[2]*deltaR[2] );
    float rI                                     =  1.0f/r;
    float r2I                                    =  rI*rI;
    float rr3                                    = -rI*r2I;
    float rr5                                    = -3.0f*rr3*r2I;
    
69
    float dampProd                               = atomI.damp*atomJ.damp;
Mark Friedrichs's avatar
Mark Friedrichs committed
70
    float ratio                                  = (dampProd != 0.0f) ? (r/dampProd) : 1.0f;
71
    float pGamma                                 = atomI.thole > atomJ.thole ? atomJ.thole: atomI.thole;
Mark Friedrichs's avatar
Mark Friedrichs committed
72
    float damp                                   = ratio*ratio*ratio*pGamma;
73
    float dampExp                                = ( (dampProd != 0.0f) && (r < cAmoebaSim.scalingDistanceCutoff) ) ? expf( -damp ) : 0.0f; 
Mark Friedrichs's avatar
Mark Friedrichs committed
74
75
76
77

    rr3                                         *= (1.0f - dampExp);
    rr5                                         *= (1.0f - ( 1.0f + damp )*dampExp);
        
78
79
80
81
    float dDotDelta                              = rr5*(deltaR[0]*atomJ.inducedDipole[0]    + deltaR[1]*atomJ.inducedDipole[1]    + deltaR[2]*atomJ.inducedDipole[2] );
    fields[0][0]                                 = rr3*atomJ.inducedDipole[0] + dDotDelta*deltaR[0];
    fields[0][1]                                 = rr3*atomJ.inducedDipole[1] + dDotDelta*deltaR[1];
    fields[0][2]                                 = rr3*atomJ.inducedDipole[2] + dDotDelta*deltaR[2];
Mark Friedrichs's avatar
Mark Friedrichs committed
82
   
83
84
85
86
    dDotDelta                                    = rr5*(deltaR[0]*atomJ.inducedDipolePolar[0]    + deltaR[1]*atomJ.inducedDipolePolar[1]    + deltaR[2]*atomJ.inducedDipolePolar[2] );
    fields[1][0]                                 = rr3*atomJ.inducedDipolePolar[0] + dDotDelta*deltaR[0];
    fields[1][1]                                 = rr3*atomJ.inducedDipolePolar[1] + dDotDelta*deltaR[1];
    fields[1][2]                                 = rr3*atomJ.inducedDipolePolar[2] + dDotDelta*deltaR[2];
Mark Friedrichs's avatar
Mark Friedrichs committed
87
  
88
89
90
91
    dDotDelta                                    = rr5*(deltaR[0]*atomI.inducedDipole[0]    + deltaR[1]*atomI.inducedDipole[1]    + deltaR[2]*atomI.inducedDipole[2] );
    fields[2][0]                                 = rr3*atomI.inducedDipole[0] + dDotDelta*deltaR[0];
    fields[2][1]                                 = rr3*atomI.inducedDipole[1] + dDotDelta*deltaR[1];
    fields[2][2]                                 = rr3*atomI.inducedDipole[2] + dDotDelta*deltaR[2];
Mark Friedrichs's avatar
Mark Friedrichs committed
92
   
93
94
95
96
97
98
99
100
101
    dDotDelta                                    = rr5*(deltaR[0]*atomI.inducedDipolePolar[0]    + deltaR[1]*atomI.inducedDipolePolar[1]    + deltaR[2]*atomI.inducedDipolePolar[2] );
    fields[3][0]                                 = rr3*atomI.inducedDipolePolar[0] + dDotDelta*deltaR[0];
    fields[3][1]                                 = rr3*atomI.inducedDipolePolar[1] + dDotDelta*deltaR[1];
    fields[3][2]                                 = rr3*atomI.inducedDipolePolar[2] + dDotDelta*deltaR[2];

    dDotDelta                                    = rr5*(deltaR[0]*atomJ.inducedDipoleS[0]    + deltaR[1]*atomJ.inducedDipoleS[1]    + deltaR[2]*atomJ.inducedDipoleS[2] );
    fields[4][0]                                 = rr3*atomJ.inducedDipoleS[0] + dDotDelta*deltaR[0];
    fields[4][1]                                 = rr3*atomJ.inducedDipoleS[1] + dDotDelta*deltaR[1];
    fields[4][2]                                 = rr3*atomJ.inducedDipoleS[2] + dDotDelta*deltaR[2];
Mark Friedrichs's avatar
Mark Friedrichs committed
102
   
103
104
105
106
    dDotDelta                                    = rr5*(deltaR[0]*atomJ.inducedDipolePolarS[0]    + deltaR[1]*atomJ.inducedDipolePolarS[1]    + deltaR[2]*atomJ.inducedDipolePolarS[2] );
    fields[5][0]                                 = rr3*atomJ.inducedDipolePolarS[0] + dDotDelta*deltaR[0];
    fields[5][1]                                 = rr3*atomJ.inducedDipolePolarS[1] + dDotDelta*deltaR[1];
    fields[5][2]                                 = rr3*atomJ.inducedDipolePolarS[2] + dDotDelta*deltaR[2];
Mark Friedrichs's avatar
Mark Friedrichs committed
107
  
108
109
110
111
    dDotDelta                                    = rr5*(deltaR[0]*atomI.inducedDipoleS[0]    + deltaR[1]*atomI.inducedDipoleS[1]    + deltaR[2]*atomI.inducedDipoleS[2] );
    fields[6][0]                                 = rr3*atomI.inducedDipoleS[0] + dDotDelta*deltaR[0];
    fields[6][1]                                 = rr3*atomI.inducedDipoleS[1] + dDotDelta*deltaR[1];
    fields[6][2]                                 = rr3*atomI.inducedDipoleS[2] + dDotDelta*deltaR[2];
Mark Friedrichs's avatar
Mark Friedrichs committed
112
   
113
114
115
116
    dDotDelta                                    = rr5*(deltaR[0]*atomI.inducedDipolePolarS[0]    + deltaR[1]*atomI.inducedDipolePolarS[1]    + deltaR[2]*atomI.inducedDipolePolarS[2] );
    fields[7][0]                                 = rr3*atomI.inducedDipolePolarS[0] + dDotDelta*deltaR[0];
    fields[7][1]                                 = rr3*atomI.inducedDipolePolarS[1] + dDotDelta*deltaR[1];
    fields[7][2]                                 = rr3*atomI.inducedDipolePolarS[2] + dDotDelta*deltaR[2];
Mark Friedrichs's avatar
Mark Friedrichs committed
117
118
119
120


}

121
__device__ void calculateMutualInducedAndGkFieldsGkPairIxn_kernel( MutualInducedParticle& atomI, MutualInducedParticle& atomJ,
Mark Friedrichs's avatar
Mark Friedrichs committed
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
                                                                   float gkField[8][3]

#ifdef AMOEBA_DEBUG
               , float4* debugArray
#endif

 )
{

    float gux[5];
    float guy[5];
    float guz[5];
    float a[3][3];
    
    // ---------------------------------------------------------------------------------------
    
138
139
140
    float xr               = atomJ.x - atomI.x;
    float yr               = atomJ.y - atomI.y;
    float zr               = atomJ.z - atomI.z;
Mark Friedrichs's avatar
Mark Friedrichs committed
141
142
143
144
145

    float xr2              = xr*xr;
    float yr2              = yr*yr;
    float zr2              = zr*zr;

146
147
    float rb2              = atomI.bornRadius*atomJ.bornRadius;

Mark Friedrichs's avatar
Mark Friedrichs committed
148
149
150
151
152
153
154
155
156
157
    float r2               = xr2 + yr2 + zr2;
    float expterm          = expf(-r2/(cAmoebaSim.gkc*rb2));
    float expc             = expterm /cAmoebaSim.gkc; 
    //float dexpc            = -2.0f / (cAmoebaSim.gkc*rb2);

    float gf2              = 1.0f / (r2+rb2*expterm);
    float gf               = sqrtf(gf2);
    float gf3              = gf2 * gf;
    float gf5              = gf3 * gf2;

158
159
160
    float duixs            = atomI.inducedDipoleS[0];
    float duiys            = atomI.inducedDipoleS[1];
    float duizs            = atomI.inducedDipoleS[2];
Mark Friedrichs's avatar
Mark Friedrichs committed
161

162
163
164
    float puixs            = atomI.inducedDipolePolarS[0];
    float puiys            = atomI.inducedDipolePolarS[1];
    float puizs            = atomI.inducedDipolePolarS[2];
Mark Friedrichs's avatar
Mark Friedrichs committed
165
 
166
167
168
    float dukxs            = atomJ.inducedDipoleS[0];
    float dukys            = atomJ.inducedDipoleS[1];
    float dukzs            = atomJ.inducedDipoleS[2];
Mark Friedrichs's avatar
Mark Friedrichs committed
169

170
171
172
    float pukxs            = atomJ.inducedDipolePolarS[0];
    float pukys            = atomJ.inducedDipolePolarS[1];
    float pukzs            = atomJ.inducedDipolePolarS[2];
Mark Friedrichs's avatar
Mark Friedrichs committed
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
 
    // reaction potential auxiliary terms
 
    a[1][0]                = -gf3;
    a[2][0]                = 3.0f * gf5;

    // reaction potential gradient auxiliary terms

    float expc1            = 1.0f - expc;
    a[1][1]                = expc1 * a[2][0];
 
    // unweighted dipole reaction potential gradient tensor

    gux[2]                 = cAmoebaSim.fd * (a[1][0] + xr2*a[1][1]);
    gux[3]                 = cAmoebaSim.fd * xr*yr*a[1][1];
    gux[4]                 = cAmoebaSim.fd * xr*zr*a[1][1];

    guy[2]                 = gux[3];
    guy[3]                 = cAmoebaSim.fd * (a[1][0] + yr2*a[1][1]);
    guy[4]                 = cAmoebaSim.fd * yr*zr*a[1][1];

    guz[2]                 = gux[4];
    guz[3]                 = guy[4];
    guz[4]                 = cAmoebaSim.fd * (a[1][0] + zr2*a[1][1]);
 
    gkField[0][0]          = dukxs*gux[2]+dukys*guy[2]+dukzs*guz[2];
    gkField[0][1]          = dukxs*gux[3]+dukys*guy[3]+dukzs*guz[3];
    gkField[0][2]          = dukxs*gux[4]+dukys*guy[4]+dukzs*guz[4];

    gkField[1][0]          = duixs*gux[2]+duiys*guy[2]+duizs*guz[2];
    gkField[1][1]          = duixs*gux[3]+duiys*guy[3]+duizs*guz[3];
    gkField[1][2]          = duixs*gux[4]+duiys*guy[4]+duizs*guz[4];

    gkField[2][0]          = pukxs*gux[2]+pukys*guy[2]+pukzs*guz[2];
    gkField[2][1]          = pukxs*gux[3]+pukys*guy[3]+pukzs*guz[3];
    gkField[2][2]          = pukxs*gux[4]+pukys*guy[4]+pukzs*guz[4];

    gkField[3][0]          = puixs*gux[2]+puiys*guy[2]+puizs*guz[2];
    gkField[3][1]          = puixs*gux[3]+puiys*guy[3]+puizs*guz[3];
    gkField[3][2]          = puixs*gux[4]+puiys*guy[4]+puizs*guz[4];

}

216
#ifdef AMOEBA_DEBUG
217
__device__ static int debugAccumulate( int index, float4* debugArray, float* field, unsigned int addMask, float idLabel )
Mark Friedrichs's avatar
Mark Friedrichs committed
218
{
219
    index                             += cSim.paddedNumberOfAtoms;
Mark Friedrichs's avatar
Mark Friedrichs committed
220
221
222
223
224
225
226
    debugArray[index].x                = addMask ? field[0] : 0.0f;
    debugArray[index].y                = addMask ? field[1] : 0.0f;
    debugArray[index].z                = addMask ? field[2] : 0.0f;
    debugArray[index].w                = idLabel;

    return index;
}
227
#endif
Mark Friedrichs's avatar
Mark Friedrichs committed
228
229
230
231
232
233
234
235
236
237
238
239
240
241


// Include versions of the kernels for N^2 calculations.

#define METHOD_NAME(a, b) a##N2##b
#include "kCalculateAmoebaCudaMutualInducedAndGkFields.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##N2ByWarp##b
#include "kCalculateAmoebaCudaMutualInducedAndGkFields.h"

__global__ 
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
242
#elif (__CUDA_ARCH__ >= 120)
Mark Friedrichs's avatar
Mark Friedrichs committed
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
void kInitializeMutualInducedAndGkField_kernel(
                   float* fixedEField,
                   float* fixedEFieldPolar,
                   float* fixedGkField,
                   float* polarizability,
                   float* inducedDipole,
                   float* inducedDipolePolar,
                   float* inducedDipoleS,
                   float* inducedDipolePolarS )
{

    int threadId = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
259
    if( threadId >= 3*cSim.atoms )return;
Mark Friedrichs's avatar
Mark Friedrichs committed
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275

    fixedEField[threadId]          *= polarizability[threadId];
    inducedDipole[threadId]         = fixedEField[threadId];

    fixedEFieldPolar[threadId]     *= polarizability[threadId];
    inducedDipolePolar[threadId]    = fixedEFieldPolar[threadId];

    fixedGkField[threadId]         *= polarizability[threadId];
    inducedDipoleS[threadId]        = fixedEField[threadId]       + fixedGkField[threadId];
    inducedDipolePolarS[threadId]   = fixedEFieldPolar[threadId]  + fixedGkField[threadId];

}

__global__ 
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
276
#elif (__CUDA_ARCH__ >= 120)
Mark Friedrichs's avatar
Mark Friedrichs committed
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
void kReduceMutualInducedAndGkFieldDelta_kernel( float* arrayOfDeltas1, float* arrayOfDeltas2,
                                                 float* arrayOfDeltas3, float* arrayOfDeltas4, float* epsilon )
{
    extern __shared__ float4 delta[];

    delta[threadIdx.x].x    = 0.0f;
    delta[threadIdx.x].y    = 0.0f;
    delta[threadIdx.x].z    = 0.0f;
    delta[threadIdx.x].w    = 0.0f;

    unsigned int pos        = threadIdx.x;

    // load deltas

295
    while( pos < 3*cSim.atoms )
Mark Friedrichs's avatar
Mark Friedrichs committed
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
    {   
        delta[threadIdx.x].x  += arrayOfDeltas1[pos];
        delta[threadIdx.x].y  += arrayOfDeltas2[pos];
        delta[threadIdx.x].z  += arrayOfDeltas3[pos];
        delta[threadIdx.x].w  += arrayOfDeltas4[pos];
        pos                   += blockDim.x*gridDim.x;
    }   
    __syncthreads();

    // sum the deltas

    for (int offset = 1; offset < blockDim.x; offset *= 2 )
    {   
        if (threadIdx.x + offset < blockDim.x && (threadIdx.x & (2*offset-1)) == 0)
        {
            delta[threadIdx.x].x   += delta[threadIdx.x+offset].x;
            delta[threadIdx.x].y   += delta[threadIdx.x+offset].y;
            delta[threadIdx.x].z   += delta[threadIdx.x+offset].z;
            delta[threadIdx.x].w   += delta[threadIdx.x+offset].w;
        }
        __syncthreads();
    }   

    // set epsilons

    if (threadIdx.x == 0)
    {   
        epsilon[0]  = delta[0].x;
        epsilon[0]  = epsilon[0] < delta[0].y ? delta[0].y : epsilon[0];
        epsilon[0]  = epsilon[0] < delta[0].z ? delta[0].z : epsilon[0];
        epsilon[0]  = epsilon[0] < delta[0].w ? delta[0].w : epsilon[0];
327
        epsilon[0]  = 48.033324f*sqrtf( epsilon[0]/( (float) cSim.atoms ) );
Mark Friedrichs's avatar
Mark Friedrichs committed
328
#ifdef AMOEBA_DEBUG
329
330
331
332
        epsilon[1]  = 48.033324f*sqrtf( delta[0].x/( (float) cSim.atoms ) );
        epsilon[2]  = 48.033324f*sqrtf( delta[0].y/( (float) cSim.atoms ) );
        epsilon[3]  = 48.033324f*sqrtf( delta[0].z/( (float) cSim.atoms ) );
        epsilon[4]  = 48.033324f*sqrtf( delta[0].w/( (float) cSim.atoms ) );
Mark Friedrichs's avatar
Mark Friedrichs committed
333
334
335
336
337
338
339
340
341
342
343
344
#endif
    }   
}

/**

   matrixProduct/matrixProductP contains epsilon**2 on output

*/
__global__ 
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
345
#elif (__CUDA_ARCH__ >= 120)
Mark Friedrichs's avatar
Mark Friedrichs committed
346
347
348
349
350
351
352
353
354
355
356
357
358
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
void kSorUpdateMutualInducedAndGkField_kernel(
                   float* polarizability,
                   float* inducedDipole, float* inducedDipoleP,
                   float* fixedEField,   float* fixedEFieldP,
                   float* matrixProduct, float* matrixProductP )
{

    float polarSOR = 0.70f;
    int threadId   = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
359
    if( threadId  >= 3*cSim.atoms)return;
Mark Friedrichs's avatar
Mark Friedrichs committed
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377

    float previousDipole                = inducedDipole[threadId];
    float previousDipoleP               = inducedDipoleP[threadId];

    inducedDipole[threadId]             = fixedEField[threadId]     + polarizability[threadId]*matrixProduct[threadId];
    inducedDipoleP[threadId]            = fixedEFieldP[threadId]    + polarizability[threadId]*matrixProductP[threadId];

    inducedDipole[threadId]             = previousDipole   + polarSOR*( inducedDipole[threadId]   - previousDipole  );   
    inducedDipoleP[threadId]            = previousDipoleP  + polarSOR*( inducedDipoleP[threadId]  - previousDipoleP );

    matrixProduct[threadId]             = ( inducedDipole[threadId]  - previousDipole  )*( inducedDipole[threadId]  - previousDipole  );
    matrixProductP[threadId]            = ( inducedDipoleP[threadId] - previousDipoleP )*( inducedDipoleP[threadId] - previousDipoleP );

}

__global__ 
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
378
#elif (__CUDA_ARCH__ >= 120)
Mark Friedrichs's avatar
Mark Friedrichs committed
379
380
381
382
383
384
385
386
387
388
389
390
391
392
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
void kSorUpdateMutualInducedAndGkFieldS_kernel(
                   float* polarizability,
                   float* inducedDipole, float* inducedDipoleP,
                   float* fixedEField,   float* fixedEFieldP,
                   float* fixedGkField,
                   float* matrixProduct, float* matrixProductP )
{

    float polarSOR = 0.70f;
    int threadId   = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
393
    if( threadId  >= 3*cSim.atoms)return;
Mark Friedrichs's avatar
Mark Friedrichs committed
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417

    float previousDipole                = inducedDipole[threadId];
    float previousDipoleP               = inducedDipoleP[threadId];

    inducedDipole[threadId]             = fixedGkField[threadId]    + fixedEField[threadId]     + polarizability[threadId]*matrixProduct[threadId];
    inducedDipoleP[threadId]            = fixedGkField[threadId]    + fixedEFieldP[threadId]    + polarizability[threadId]*matrixProductP[threadId];

    inducedDipole[threadId]             = previousDipole   + polarSOR*( inducedDipole[threadId]   - previousDipole  );   
    inducedDipoleP[threadId]            = previousDipoleP  + polarSOR*( inducedDipoleP[threadId]  - previousDipoleP );

    matrixProduct[threadId]             = ( inducedDipole[threadId]  - previousDipole  )*( inducedDipole[threadId]  - previousDipole  );
    matrixProductP[threadId]            = ( inducedDipoleP[threadId] - previousDipoleP )*( inducedDipoleP[threadId] - previousDipoleP );

}

// reduce psWorkArray_3_1 -> outputArray
// reduce psWorkArray_3_2 -> outputPolarArray
// reduce psWorkArray_3_3 -> outputArrayS
// reduce psWorkArray_3_4 -> outputPolarArrayS

static void kReduceMutualInducedAndGkFields(amoebaGpuContext amoebaGpu,
                                            CUDAStream<float>* outputArray,  CUDAStream<float>* outputPolarArray,
                                            CUDAStream<float>* outputArrayS, CUDAStream<float>* outputPolarArrayS )
{
418
419
420
    gpuContext gpu = amoebaGpu->gpuContext;
    kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
                               gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
421
                               amoebaGpu->psWorkArray_3_1->_pDevData, outputArray->_pDevData, 0 );
Mark Friedrichs's avatar
Mark Friedrichs committed
422
423
    LAUNCHERROR("kReduceMutualInducedAndGkFields1");

424
425
    kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
                               gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
426
                               amoebaGpu->psWorkArray_3_2->_pDevData, outputPolarArray->_pDevData, 0 );
Mark Friedrichs's avatar
Mark Friedrichs committed
427
428
    LAUNCHERROR("kReduceMutualInducedAndGkFields2");

429
430
    kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
                               gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
431
                               amoebaGpu->psWorkArray_3_3->_pDevData, outputArrayS->_pDevData, 0 );
Mark Friedrichs's avatar
Mark Friedrichs committed
432
433
    LAUNCHERROR("kReduceMutualInducedAndGkFields3");

434
435
    kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
                               gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
436
                               amoebaGpu->psWorkArray_3_4->_pDevData, outputPolarArrayS->_pDevData, 0 );
Mark Friedrichs's avatar
Mark Friedrichs committed
437
438
439
440
441
442
443
444
    LAUNCHERROR("kReduceMutualInducedAndGkFields4");
}

#ifdef AMOEBA_DEBUG
#if 0
static void printMiFieldBuffer( amoebaGpuContext amoebaGpu, unsigned int bufferIndex )
{
    (void) fprintf( amoebaGpu->log, "MI Field Buffer %u\n", bufferIndex );
445
446
    unsigned int start = bufferIndex*3*gpu->sim.paddedNumberOfAtoms;
    unsigned int stop  = (bufferIndex+1)*3*gpu->sim.paddedNumberOfAtoms;
Mark Friedrichs's avatar
Mark Friedrichs committed
447
448
    for( unsigned int ii = start; ii < stop; ii += 3 ){
        unsigned int ii3Index      = ii/3;
449
450
        unsigned int bufferIndex   = ii3Index/(gpu->sim.paddedNumberOfAtoms);
        unsigned int particleIndex = ii3Index - bufferIndex*(gpu->sim.paddedNumberOfAtoms);
Mark Friedrichs's avatar
Mark Friedrichs committed
451
452
        (void) fprintf( amoebaGpu->log, "   %6u %3u %6u [%14.6e %14.6e %14.6e] [%14.6e %14.6e %14.6e]\n", 
                            ii/3,  bufferIndex, particleIndex,
Mark Friedrichs's avatar
Mark Friedrichs committed
453
454
455
456
457
458
                            amoebaGpu->psWorkArray_3_1->_pSysData[ii],
                            amoebaGpu->psWorkArray_3_1->_pSysData[ii+1],
                            amoebaGpu->psWorkArray_3_1->_pSysData[ii+2],
                            amoebaGpu->psWorkArray_3_2->_pSysData[ii],
                            amoebaGpu->psWorkArray_3_2->_pSysData[ii+1],
                            amoebaGpu->psWorkArray_3_2->_pSysData[ii+2] );
Mark Friedrichs's avatar
Mark Friedrichs committed
459
460
461
462
463
464
    } 
}

static void printMiFieldAtomBuffers( amoebaGpuContext amoebaGpu, unsigned int targetAtom )
{
    (void) fprintf( amoebaGpu->log, "MI Field atom %u\n", targetAtom );
465
466
    for( unsigned int ii = 0; ii < gpu->sim.outputBuffers; ii++ ){
        unsigned int particleIndex = 3*(targetAtom + ii*gpu->sim.paddedNumberOfAtoms);
Mark Friedrichs's avatar
Mark Friedrichs committed
467
468
        (void) fprintf( amoebaGpu->log, " %2u %6u [%14.6e %14.6e %14.6e] [%14.6e %14.6e %14.6e]\n", 
                        ii, particleIndex,
Mark Friedrichs's avatar
Mark Friedrichs committed
469
470
471
472
473
474
                        amoebaGpu->psWorkArray_3_1->_pSysData[particleIndex],
                        amoebaGpu->psWorkArray_3_1->_pSysData[particleIndex+1],
                        amoebaGpu->psWorkArray_3_1->_pSysData[particleIndex+2],
                        amoebaGpu->psWorkArray_3_2->_pSysData[particleIndex],
                        amoebaGpu->psWorkArray_3_2->_pSysData[particleIndex+1],
                        amoebaGpu->psWorkArray_3_2->_pSysData[particleIndex+2] );
Mark Friedrichs's avatar
Mark Friedrichs committed
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
    } 
}
#endif
#endif

/**---------------------------------------------------------------------------------------

   Compute mutual induce field

   @param amoebaGpu        amoebaGpu context

   --------------------------------------------------------------------------------------- */

static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuContext amoebaGpu,
                                                                    CUDAStream<float>* outputArray,  CUDAStream<float>* outputPolarArray,
                                                                    CUDAStream<float>* outputArrayS, CUDAStream<float>* outputPolarArrayS )
{
  
   // ---------------------------------------------------------------------------------------

    static unsigned int threadsPerBlock = 0;

   // ---------------------------------------------------------------------------------------

  gpuContext gpu    = amoebaGpu->gpuContext;

#ifdef AMOEBA_DEBUG
    static int iteration = 1;
    int targetAtom    = 0;
    static const char* methodName       = "cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply";
    if( 1 && amoebaGpu->log ){
506
        (void) fprintf( amoebaGpu->log, "%s\n", methodName );
Mark Friedrichs's avatar
Mark Friedrichs committed
507
508
509
510
        (void) fflush( amoebaGpu->log );
    }
    int paddedNumberOfAtoms                    = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
    CUDAStream<float4>* debugArray             = new CUDAStream<float4>(paddedNumberOfAtoms*paddedNumberOfAtoms, 1, "DebugArray");
Mark Friedrichs's avatar
Mark Friedrichs committed
511
    memset( debugArray->_pSysData,      0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
Mark Friedrichs's avatar
Mark Friedrichs committed
512
513
514
515
516
517
518
519
520
521
    debugArray->Upload();
#endif

    // clear output arrays

    kClearFields_3( amoebaGpu, 4 );

    // set threads/block first time through

    if( threadsPerBlock == 0 ){
522
523
        unsigned int maxThreads;
        if (gpu->sm_version >= SM_20)
Peter Eastman's avatar
Peter Eastman committed
524
            maxThreads = 384;
525
526
527
528
529
        else if (gpu->sm_version >= SM_12)
            maxThreads = 128;
        else
            maxThreads = 64;
        threadsPerBlock = std::min(getThreadsPerBlock( amoebaGpu, sizeof(MutualInducedParticle)), maxThreads);
Mark Friedrichs's avatar
Mark Friedrichs committed
530
531
    }
    
532
533
#ifdef AMOEBA_DEBUG
    if( amoebaGpu->log ){
534
535
        (void) fprintf( amoebaGpu->log, "cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%lu ixnCt=%lu workUnits=%lu\n",
                        gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
536
537
538
539
540
541
                        sizeof(MutualInducedParticle), sizeof(MutualInducedParticle)*threadsPerBlock,
                        (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
        (void) fflush( amoebaGpu->log );
    }
#endif

Mark Friedrichs's avatar
Mark Friedrichs committed
542
    if (gpu->bOutputBufferPerWarp){
543
        kCalculateAmoebaMutualInducedAndGkFieldsN2ByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
Mark Friedrichs's avatar
Mark Friedrichs committed
544
545
546
547
                                                                 amoebaGpu->psWorkUnit->_pDevData,
                                                                 amoebaGpu->psWorkArray_3_1->_pDevData,
                                                                 amoebaGpu->psWorkArray_3_2->_pDevData,
                                                                 amoebaGpu->psWorkArray_3_3->_pDevData,
548
#ifdef AMOEBA_DEBUG
Mark Friedrichs's avatar
Mark Friedrichs committed
549
550
                                                                 amoebaGpu->psWorkArray_3_4->_pDevData,
                                                                 debugArray->_pDevData, targetAtom );
Mark Friedrichs's avatar
Mark Friedrichs committed
551
#else
Mark Friedrichs's avatar
Mark Friedrichs committed
552
                                                                 amoebaGpu->psWorkArray_3_4->_pDevData );
Mark Friedrichs's avatar
Mark Friedrichs committed
553
554
555
#endif
    } else {

556
        kCalculateAmoebaMutualInducedAndGkFieldsN2_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
Mark Friedrichs's avatar
Mark Friedrichs committed
557
558
559
560
                                                            amoebaGpu->psWorkUnit->_pDevData,
                                                            amoebaGpu->psWorkArray_3_1->_pDevData,
                                                            amoebaGpu->psWorkArray_3_2->_pDevData,
                                                            amoebaGpu->psWorkArray_3_3->_pDevData,
Mark Friedrichs's avatar
Mark Friedrichs committed
561
#ifdef AMOEBA_DEBUG
Mark Friedrichs's avatar
Mark Friedrichs committed
562
563
                                                            amoebaGpu->psWorkArray_3_4->_pDevData,
                                                            debugArray->_pDevData, targetAtom );
Mark Friedrichs's avatar
Mark Friedrichs committed
564
#else
Mark Friedrichs's avatar
Mark Friedrichs committed
565
                                                            amoebaGpu->psWorkArray_3_4->_pDevData );
Mark Friedrichs's avatar
Mark Friedrichs committed
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
#endif

    }
    LAUNCHERROR("kCalculateAmoebaMutualInducedAndGkFields");  

    kReduceMutualInducedAndGkFields( amoebaGpu, outputArray, outputPolarArray, outputArrayS, outputPolarArrayS );

#ifdef AMOEBA_DEBUG
        amoebaGpu->psWorkArray_3_1->Download();
        amoebaGpu->psWorkArray_3_2->Download();
        amoebaGpu->psWorkArray_3_3->Download();
        amoebaGpu->psWorkArray_3_4->Download();

        //printMiFieldAtomBuffers( amoebaGpu, (targetAtom + 0) );
        //printMiFieldAtomBuffers( amoebaGpu, (targetAtom + 1) );
        //printMiFieldAtomBuffers( amoebaGpu, 100 );
        //printMiFieldBuffer( amoebaGpu, 0 );
        //printMiFieldBuffer( amoebaGpu, 1 );
        //printMiFieldBuffer( amoebaGpu, 37 );
        //printMiFieldBuffer( amoebaGpu, 38 );

587
    if( amoebaGpu->log && iteration == 1 ){
Mark Friedrichs's avatar
Mark Friedrichs committed
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607

        (void) fprintf( amoebaGpu->log, "Finished MI kernel execution %d\n", iteration ); (void) fflush( amoebaGpu->log );

        outputArray->Download();
        outputPolarArray->Download();

        outputArrayS->Download();
        outputPolarArrayS->Download();

        debugArray->Download();

        int maxPrint        = 33;
        for( int ii = 0; ii < gpu->natoms; ii++ ){
           (void) fprintf( amoebaGpu->log, "%5d ", ii); 

            int indexOffset     = ii*3;
    
           // Mi

           (void) fprintf( amoebaGpu->log," Mult[%16.9e %16.9e %16.9e] ",
Mark Friedrichs's avatar
Mark Friedrichs committed
608
609
610
                           outputArray->_pSysData[indexOffset],
                           outputArray->_pSysData[indexOffset+1],
                           outputArray->_pSysData[indexOffset+2] );
Mark Friedrichs's avatar
Mark Friedrichs committed
611
612
613
614
    
           // Mi polar

           (void) fprintf( amoebaGpu->log," MultP[%16.9e %16.9e %16.9e]\n",
Mark Friedrichs's avatar
Mark Friedrichs committed
615
616
617
                           outputPolarArray->_pSysData[indexOffset],
                           outputPolarArray->_pSysData[indexOffset+1],
                           outputPolarArray->_pSysData[indexOffset+2] );
Mark Friedrichs's avatar
Mark Friedrichs committed
618
619
620
621

           // MiS

           (void) fprintf( amoebaGpu->log,"      MultS[%16.9e %16.9e %16.9e] ",
Mark Friedrichs's avatar
Mark Friedrichs committed
622
623
624
                           outputArrayS->_pSysData[indexOffset],
                           outputArrayS->_pSysData[indexOffset+1],
                           outputArrayS->_pSysData[indexOffset+2] );
Mark Friedrichs's avatar
Mark Friedrichs committed
625
626
627
628
    
           // Mi polarS

           (void) fprintf( amoebaGpu->log,"MultPS[%16.9e %16.9e %16.9e]\n",
Mark Friedrichs's avatar
Mark Friedrichs committed
629
630
631
                           outputPolarArrayS->_pSysData[indexOffset],
                           outputPolarArrayS->_pSysData[indexOffset+1],
                           outputPolarArrayS->_pSysData[indexOffset+2] );
Mark Friedrichs's avatar
Mark Friedrichs committed
632
633
634
635
636

           // coords

#if 0
            (void) fprintf( amoebaGpu->log,"x[%16.9e %16.9e %16.9e] ",
Mark Friedrichs's avatar
Mark Friedrichs committed
637
638
639
                            gpu->psPosq4->_pSysData[ii].x,
                            gpu->psPosq4->_pSysData[ii].y,
                            gpu->psPosq4->_pSysData[ii].z);
Mark Friedrichs's avatar
Mark Friedrichs committed
640
641
642
643


           for( int jj = 0; jj < gpu->natoms && jj < 5; jj++ ){
               int debugIndex = jj*gpu->natoms + ii;
Mark Friedrichs's avatar
Mark Friedrichs committed
644
645
646
               float xx       =  gpu->psPosq4->_pSysData[jj].x -  gpu->psPosq4->_pSysData[ii].x;
               float yy       =  gpu->psPosq4->_pSysData[jj].y -  gpu->psPosq4->_pSysData[ii].y;
               float zz       =  gpu->psPosq4->_pSysData[jj].z -  gpu->psPosq4->_pSysData[ii].z;
Mark Friedrichs's avatar
Mark Friedrichs committed
647
648
               (void) fprintf( amoebaGpu->log,"\n%4d %4d delta [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e] ",
                               ii, jj, xx, yy, zz,
Mark Friedrichs's avatar
Mark Friedrichs committed
649
                               debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y, debugArray->_pSysData[debugIndex].z );
Mark Friedrichs's avatar
Mark Friedrichs committed
650
651
652
653
654
655
656
657
658
659

           }
#endif
           if( ii == targetAtom ){
               (void) fprintf( amoebaGpu->log,"\n" );
               int paddedNumberOfAtoms                    = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
               for( int jj = 0; jj < gpu->natoms; jj++ ){
                   int debugIndex = jj;
                   (void) fprintf( amoebaGpu->log,"%4d %4d Rint [%16.9e %16.9e %16.9e %16.9e]\n",
                                   ii, jj,
Mark Friedrichs's avatar
Mark Friedrichs committed
660
661
                                   debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
                                   debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w );
Mark Friedrichs's avatar
Mark Friedrichs committed
662
663
664
665

                   for( int kk = 0; kk < 9; kk++ ){
                       debugIndex += paddedNumberOfAtoms;
                       (void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %5.1f]\n",
Mark Friedrichs's avatar
Mark Friedrichs committed
666
                                       debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y, debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w );
Mark Friedrichs's avatar
Mark Friedrichs committed
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
                   }
               }
               (void) fprintf( amoebaGpu->log,"\n" );
           }
           (void) fprintf( amoebaGpu->log,"\n" );
           if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
                ii = gpu->natoms - maxPrint;
           }
        }
        (void) fflush( amoebaGpu->log );
        iteration++;

     }
     delete debugArray;
#endif

}

/**---------------------------------------------------------------------------------------

   Compute mutual induce field

   @param amoebaGpu        amoebaGpu context

   --------------------------------------------------------------------------------------- */

static void cudaComputeAmoebaMutualInducedAndGkFieldBySOR( amoebaGpuContext amoebaGpu )
{
  
   // ---------------------------------------------------------------------------------------

698
    static int timestep                  = 0;
Mark Friedrichs's avatar
Mark Friedrichs committed
699
    timestep++;
Mark Friedrichs's avatar
Update  
Mark Friedrichs committed
700
#ifdef AMOEBA_DEBUG
701
    static const char* methodName        = "cudaComputeAmoebaMutualInducedAndGkFieldBySOR";
Mark Friedrichs's avatar
Update  
Mark Friedrichs committed
702
    std::vector<int> fileId;
Mark Friedrichs's avatar
Mark Friedrichs committed
703
704
705
706
707
708
709
710
711
712
    fileId.resize( 2 );
    fileId[0] = timestep;
    fileId[1] = 1;
#endif

   // ---------------------------------------------------------------------------------------

    int done;
    int iteration;

713
    gpuContext gpu     = amoebaGpu->gpuContext;
Mark Friedrichs's avatar
Mark Friedrichs committed
714
715
716
717
718
719
720
    int numOfElems     = gpu->natoms*3;
    int numThreads     = min( THREADS_PER_BLOCK, numOfElems );
    int numBlocks      = numOfElems/numThreads;

    if( (numOfElems % numThreads) != 0 )numBlocks++;

#ifdef AMOEBA_DEBUG
Mark Friedrichs's avatar
Update  
Mark Friedrichs committed
721
    if( amoebaGpu->log && timestep == 1 ){
Mark Friedrichs's avatar
Mark Friedrichs committed
722
723
724
725
726
727
728
729
730
731
732
733
734
735
        (void) fprintf( amoebaGpu->log, "%s %d numOfElems=%d numThreads=%d numBlocks=%d "
                        "maxIterations=%d targetEpsilon=%.3e\n", 
                        methodName, gpu->natoms, numOfElems, numThreads, numBlocks,
                        amoebaGpu->mutualInducedMaxIterations, amoebaGpu->mutualInducedTargetEpsilon);
        (void) fflush( amoebaGpu->log );
    }   
#endif

   // ---------------------------------------------------------------------------------------

    // set  E_Field & E_FieldPolar] to [ E_Field & E_FieldPolar]*Polarizability
    // initialize [ InducedDipole & InducedDipolePolar ] to [ E_Field & E_FieldPolar]*Polarizability

    kInitializeMutualInducedAndGkField_kernel<<< numBlocks, numThreads >>>(
Mark Friedrichs's avatar
Mark Friedrichs committed
736
737
738
739
740
741
742
743
         amoebaGpu->psE_Field->_pDevData,
         amoebaGpu->psE_FieldPolar->_pDevData,
         amoebaGpu->psGk_Field->_pDevData,
         amoebaGpu->psPolarizability->_pDevData,
         amoebaGpu->psInducedDipole->_pDevData,
         amoebaGpu->psInducedDipolePolar->_pDevData,
         amoebaGpu->psInducedDipoleS->_pDevData,
         amoebaGpu->psInducedDipolePolarS->_pDevData );
Mark Friedrichs's avatar
Mark Friedrichs committed
744
745
746
747
748
    LAUNCHERROR("kInitializeMutualInducedAndGkField");  

#ifdef AMOEBA_DEBUG
    if( amoebaGpu->log ){

749
750
        (void) fprintf( amoebaGpu->log, "%s Initial setup for matrix multiply\n", methodName ); fflush( amoebaGpu->log );

Mark Friedrichs's avatar
Mark Friedrichs committed
751
752
753
754
755
756
757
758
759
760
761
762
        amoebaGpu->psE_Field->Download();
        amoebaGpu->psE_FieldPolar->Download();
        amoebaGpu->psInducedDipole->Download(),
        amoebaGpu->psInducedDipolePolar->Download();
        amoebaGpu->psInducedDipoleS->Download(),
        amoebaGpu->psInducedDipolePolarS->Download();
        amoebaGpu->psPolarizability->Download();

        int offset   = 0;
        int maxPrint = 10;
        for( int ii = 0; ii < gpu->natoms; ii++ ){
            (void) fprintf( amoebaGpu->log, "\n%4d pol=%12.4e\n", ii, 
Mark Friedrichs's avatar
Mark Friedrichs committed
763
764
765
766
                            amoebaGpu->psPolarizability->_pSysData[offset] );
            if( amoebaGpu->psPolarizability->_pSysData[offset] != amoebaGpu->psPolarizability->_pSysData[offset+1] ||
                amoebaGpu->psPolarizability->_pSysData[offset] != amoebaGpu->psPolarizability->_pSysData[offset+2] ){
                (void) fprintf( amoebaGpu->log, "PolX!!! %12.4e %12.4e ", amoebaGpu->psPolarizability->_pSysData[offset+1], amoebaGpu->psPolarizability->_pSysData[offset+2] ); 
Mark Friedrichs's avatar
Mark Friedrichs committed
767
768
769
            }

            (void) fprintf( amoebaGpu->log," E[%14.6e %14.6e %14.6e]  Mi[%14.6e %14.6e %14.6e]\n",
Mark Friedrichs's avatar
Mark Friedrichs committed
770
771
                            amoebaGpu->psE_Field->_pSysData[offset],       amoebaGpu->psE_Field->_pSysData[offset+1],       amoebaGpu->psE_Field->_pSysData[offset+2],
                            amoebaGpu->psInducedDipole->_pSysData[offset], amoebaGpu->psInducedDipole->_pSysData[offset+1], amoebaGpu->psInducedDipole->_pSysData[offset+2] );
Mark Friedrichs's avatar
Mark Friedrichs committed
772
            (void) fprintf( amoebaGpu->log,"Ep[%14.6e %14.6e %14.6e] Mip[%14.6e %14.6e %14.6e]\n",
Mark Friedrichs's avatar
Mark Friedrichs committed
773
774
                            amoebaGpu->psE_FieldPolar->_pSysData[offset],       amoebaGpu->psE_FieldPolar->_pSysData[offset+1],       amoebaGpu->psE_FieldPolar->_pSysData[offset+2],
                            amoebaGpu->psInducedDipolePolar->_pSysData[offset], amoebaGpu->psInducedDipolePolar->_pSysData[offset+1], amoebaGpu->psInducedDipolePolar->_pSysData[offset+2] );
Mark Friedrichs's avatar
Mark Friedrichs committed
775
776

            (void) fprintf( amoebaGpu->log,"Gk[%14.6e %14.6e %14.6e] MiS[%14.6e %14.6e %14.6e] MipS[%14.6e %14.6e %14.6e]\n",
Mark Friedrichs's avatar
Mark Friedrichs committed
777
778
779
                            amoebaGpu->psGk_Field->_pSysData[offset],            amoebaGpu->psGk_Field->_pSysData[offset+1],            amoebaGpu->psGk_Field->_pSysData[offset+2],
                            amoebaGpu->psInducedDipoleS->_pSysData[offset],      amoebaGpu->psInducedDipoleS->_pSysData[offset+1],      amoebaGpu->psInducedDipoleS->_pSysData[offset+2],
                            amoebaGpu->psInducedDipolePolarS->_pSysData[offset], amoebaGpu->psInducedDipolePolarS->_pSysData[offset+1], amoebaGpu->psInducedDipolePolarS->_pSysData[offset+2] );
Mark Friedrichs's avatar
Mark Friedrichs committed
780
781
782
783
784
785
786
            offset += 3;
            if( ii == maxPrint && (ii < (gpu->natoms - maxPrint) ) )ii =  (gpu->natoms - maxPrint);
        }   
        (void) fflush( amoebaGpu->log );
    }   
#endif

787
788
789
790
791
792
793
794
795
    // if polarization type is direct, set flags signalling done and return

    if( amoebaGpu->amoebaSim.polarizationType )
    {   
        amoebaGpu->mutualInducedDone          = 1;
        amoebaGpu->mutualInducedConverged     = 1;
        return;
    }   

Mark Friedrichs's avatar
Mark Friedrichs committed
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
    // ---------------------------------------------------------------------------------------
 
    done      = 0;
    iteration = 1;

    while( !done ){

        // matrix multiply

        cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpu,
                                                                amoebaGpu->psWorkVector[0],  amoebaGpu->psWorkVector[1],
                                                                amoebaGpu->psWorkVector[2],  amoebaGpu->psWorkVector[3] );

        LAUNCHERROR("cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply");  

    // ---------------------------------------------------------------------------------------

        // post matrix multiply

        kSorUpdateMutualInducedAndGkField_kernel<<< numBlocks, numThreads >>>(
Mark Friedrichs's avatar
Mark Friedrichs committed
816
817
818
819
           amoebaGpu->psPolarizability->_pDevData,
           amoebaGpu->psInducedDipole->_pDevData,     amoebaGpu->psInducedDipolePolar->_pDevData,
           amoebaGpu->psE_Field->_pDevData,           amoebaGpu->psE_FieldPolar->_pDevData,
           amoebaGpu->psWorkVector[0]->_pDevData,     amoebaGpu->psWorkVector[1]->_pDevData );
Mark Friedrichs's avatar
Mark Friedrichs committed
820
821
822
        LAUNCHERROR("cudaComputeAmoebaMutualInducedAndGkFieldSorUpdate1");  

        kSorUpdateMutualInducedAndGkFieldS_kernel<<< numBlocks, numThreads >>>(
Mark Friedrichs's avatar
Mark Friedrichs committed
823
824
825
826
827
           amoebaGpu->psPolarizability->_pDevData,
           amoebaGpu->psInducedDipoleS->_pDevData,    amoebaGpu->psInducedDipolePolarS->_pDevData,
           amoebaGpu->psE_Field->_pDevData,          amoebaGpu->psE_FieldPolar->_pDevData,
           amoebaGpu->psGk_Field->_pDevData,
           amoebaGpu->psWorkVector[2]->_pDevData,     amoebaGpu->psWorkVector[3]->_pDevData );
Mark Friedrichs's avatar
Mark Friedrichs committed
828
829
830
831
832
        LAUNCHERROR("cudaComputeAmoebaMutualInducedAndGkFieldSorUpdate2");  

        // get total epsilon -- performing sums on gpu

        kReduceMutualInducedAndGkFieldDelta_kernel<<<1, amoebaGpu->epsilonThreadsPerBlock, 4*sizeof(float)*amoebaGpu->epsilonThreadsPerBlock>>>(
Mark Friedrichs's avatar
Mark Friedrichs committed
833
834
835
           amoebaGpu->psWorkVector[0]->_pDevData, amoebaGpu->psWorkVector[1]->_pDevData,
           amoebaGpu->psWorkVector[2]->_pDevData, amoebaGpu->psWorkVector[3]->_pDevData,
           amoebaGpu->psCurrentEpsilon->_pDevData );
Mark Friedrichs's avatar
Mark Friedrichs committed
836
837
838
839
840
841
842
843
844
        LAUNCHERROR("kReduceMutualInducedAndGkFieldDelta_kernel");

#if 0
        // get total epsilon -- performing sums on cpu
{
        float sum[4];
        float currentEpsilon = -1.0e30;
        for( int ii = 0; ii < 4; ii++ ){
            sum[ii]                  = cudaGetSum( 3*gpu->natoms, amoebaGpu->psWorkVector[ii]);
845
            sum[ii]                  = 48.033324f*sqrtf( sum[ii]/( (float) gpu->natoms) );
Mark Friedrichs's avatar
Mark Friedrichs committed
846
847
848
849
850
851
852
853
            if( sum[ii] > currentEpsilon ){
                currentEpsilon = sum[ii];
            }
        }

        amoebaGpu->mutualInducedCurrentEpsilon  = currentEpsilon;
        (void) fprintf( amoebaGpu->log, "%s iteration=%3d eps %14.6e [%14.6e %14.6e] done=%d sums=%14.6e %14.6e %14.6e %14.6e\n",
                        methodName, iteration, amoebaGpu->mutualInducedCurrentEpsilon,
Mark Friedrichs's avatar
Mark Friedrichs committed
854
855
                           amoebaGpu->psCurrentEpsilon->_pSysData[1], 
                           amoebaGpu->psCurrentEpsilon->_pSysData[2], done, sum[0], sum[1], sum[2], sum[3] );
Mark Friedrichs's avatar
Mark Friedrichs committed
856
857
858
}
#endif

859
        // Debye=48.033324f
860

Mark Friedrichs's avatar
Mark Friedrichs committed
861
        amoebaGpu->psCurrentEpsilon->Download();
Mark Friedrichs's avatar
Mark Friedrichs committed
862
        float currentEpsilon                     = amoebaGpu->psCurrentEpsilon->_pSysData[0];
Mark Friedrichs's avatar
Mark Friedrichs committed
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
        amoebaGpu->mutualInducedCurrentEpsilon   = currentEpsilon;

        if( iteration > amoebaGpu->mutualInducedMaxIterations || amoebaGpu->mutualInducedCurrentEpsilon < amoebaGpu->mutualInducedTargetEpsilon ){ 
            done = 1;
        }

#ifdef AMOEBA_DEBUG
        if( amoebaGpu->log ){
           amoebaGpu->psInducedDipole->Download();
           amoebaGpu->psInducedDipolePolar->Download();
           amoebaGpu->psInducedDipoleS->Download();
           amoebaGpu->psInducedDipolePolarS->Download();
           amoebaGpu->psWorkVector[2]->Download();
           amoebaGpu->psWorkVector[3]->Download();
           amoebaGpu->psGk_Field->Download();

#if 1
           (void) fprintf( amoebaGpu->log, "%s iteration=%3d eps %14.6e [%14.6e %14.6e] done=%d\n",
                           methodName, iteration, amoebaGpu->mutualInducedCurrentEpsilon,
Mark Friedrichs's avatar
Mark Friedrichs committed
882
883
                           amoebaGpu->psCurrentEpsilon->_pSysData[1], 
                           amoebaGpu->psCurrentEpsilon->_pSysData[2], done );
Mark Friedrichs's avatar
Mark Friedrichs committed
884
885
886
#else
           (void) fprintf( amoebaGpu->log, "%s iteration=%3d eps %14.6e %14.6e crrntEps=%14.6e %14.6e %14.6e %14.6e done=%d\n",
                           methodName, iteration, sum1, sum2, amoebaGpu->mutualInducedCurrentEpsilon,
Mark Friedrichs's avatar
Mark Friedrichs committed
887
888
889
                           amoebaGpu->psCurrentEpsilon->_pSysData[0], 
                           amoebaGpu->psCurrentEpsilon->_pSysData[1], 
                           amoebaGpu->psCurrentEpsilon->_pSysData[2], done );
Mark Friedrichs's avatar
Mark Friedrichs committed
890
891
892
893
894
895
896
897
898
#endif
           (void) fflush( amoebaGpu->log );

            int offset   = 0;
            int maxPrint = 20;
            for( int ii = 0; ii < gpu->natoms; ii++ ){
                (void) fprintf( amoebaGpu->log, "%4d ", ii ); 
    
                (void) fprintf( amoebaGpu->log," Mi[%14.6e %14.6e %14.6e] ",
Mark Friedrichs's avatar
Mark Friedrichs committed
899
                                amoebaGpu->psInducedDipole->_pSysData[offset], amoebaGpu->psInducedDipole->_pSysData[offset+1], amoebaGpu->psInducedDipole->_pSysData[offset+2] );
Mark Friedrichs's avatar
Mark Friedrichs committed
900
                (void) fprintf( amoebaGpu->log,"Mip[%14.6e %14.6e %14.6e]",
Mark Friedrichs's avatar
Mark Friedrichs committed
901
                                amoebaGpu->psInducedDipolePolar->_pSysData[offset], amoebaGpu->psInducedDipolePolar->_pSysData[offset+1], amoebaGpu->psInducedDipolePolar->_pSysData[offset+2] );
Mark Friedrichs's avatar
Mark Friedrichs committed
902
                (void) fprintf( amoebaGpu->log," MiS[%14.6e %14.6e %14.6e] ",
Mark Friedrichs's avatar
Mark Friedrichs committed
903
                                amoebaGpu->psInducedDipoleS->_pSysData[offset], amoebaGpu->psInducedDipoleS->_pSysData[offset+1], amoebaGpu->psInducedDipoleS->_pSysData[offset+2] );
Mark Friedrichs's avatar
Mark Friedrichs committed
904
                (void) fprintf( amoebaGpu->log,"MipS[%14.6e %14.6e %14.6e]\n",
Mark Friedrichs's avatar
Mark Friedrichs committed
905
                                amoebaGpu->psInducedDipolePolarS->_pSysData[offset], amoebaGpu->psInducedDipolePolarS->_pSysData[offset+1], amoebaGpu->psInducedDipolePolarS->_pSysData[offset+2] );
Mark Friedrichs's avatar
Mark Friedrichs committed
906
907
/*
                (void) fprintf( amoebaGpu->log,"Gk [%14.6e %14.6e %14.6e]\n",
Mark Friedrichs's avatar
Mark Friedrichs committed
908
                                amoebaGpu->psGk_Field->_pSysData[offset], amoebaGpu->psGk_Field->_pSysData[offset+1], amoebaGpu->psGk_Field->_pSysData[offset+2] );
Mark Friedrichs's avatar
Mark Friedrichs committed
909
                (void) fprintf( amoebaGpu->log,"W2 [%14.6e %14.6e %14.6e] ",
Mark Friedrichs's avatar
Mark Friedrichs committed
910
                                amoebaGpu->psWorkVector[2]->_pSysData[offset], amoebaGpu->psWorkVector[2]->_pSysData[offset+1], amoebaGpu->psWorkVector[2]->_pSysData[offset+2] );
Mark Friedrichs's avatar
Mark Friedrichs committed
911
                (void) fprintf( amoebaGpu->log,"W3 [%14.6e %14.6e %14.6e]\n",
Mark Friedrichs's avatar
Mark Friedrichs committed
912
                                amoebaGpu->psWorkVector[3]->_pSysData[offset], amoebaGpu->psWorkVector[3]->_pSysData[offset+1], amoebaGpu->psWorkVector[3]->_pSysData[offset+2] );
Mark Friedrichs's avatar
Mark Friedrichs committed
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
*/
                if( ii == maxPrint && (ii < (gpu->natoms - maxPrint) ) ){
                    ii =  (gpu->natoms - maxPrint);
                    offset = 3*(ii+1);
                } else {
                    offset += 3;
                }
            }   
            (void) fflush( amoebaGpu->log );
        }
#endif
        iteration++;
    }

    amoebaGpu->mutualInducedDone             = done;
    amoebaGpu->mutualInducedConverged        = ( !done || iteration > amoebaGpu->mutualInducedMaxIterations ) ? 0 : 1;

Mark Friedrichs's avatar
Mark Friedrichs committed
930
    if( 0 && amoebaGpu->log ){
931
        trackMutualInducedIterations( amoebaGpu, iteration );
Mark Friedrichs's avatar
Update  
Mark Friedrichs committed
932
933
    }

Mark Friedrichs's avatar
Mark Friedrichs committed
934
#ifdef AMOEBA_DEBUG
Mark Friedrichs's avatar
Mark Friedrichs committed
935
    if( 0 ){
Mark Friedrichs's avatar
Mark Friedrichs committed
936
937
938
        std::vector<int> fileId;
        //fileId.push_back( 0 );
        VectorOfDoubleVectors outputVector;
939
940
941
942
943
944
        //cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,                    outputVector, NULL, 1.0f );
        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psInducedDipole,       outputVector, NULL, 1.0f );
        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psInducedDipolePolar,  outputVector, NULL, 1.0f );
        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psInducedDipoleS,      outputVector, NULL, 1.0f );
        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psInducedDipolePolarS, outputVector, NULL, 1.0f );
        cudaWriteVectorOfDoubleVectorsToFile( "Cuda_GK_MI", fileId, outputVector );
Mark Friedrichs's avatar
Mark Friedrichs committed
945
946
947
948
949
950
951
952
953
954
955
956
     }
#endif

   // ---------------------------------------------------------------------------------------
}

void cudaComputeAmoebaMutualInducedAndGkField( amoebaGpuContext amoebaGpu )
{
    if( amoebaGpu->mutualInducedIterativeMethod == 0 ){
        cudaComputeAmoebaMutualInducedAndGkFieldBySOR( amoebaGpu );
    }
}