kCalculateAmoebaCudaPmeMutualInducedField.cu 34.9 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
//-----------------------------------------------------------------------------------------

//-----------------------------------------------------------------------------------------

#include "amoebaGpuTypes.h"
#include "amoebaCudaKernels.h"
#include "kCalculateAmoebaCudaUtilities.h"

#include <stdio.h>

using namespace std;

static __constant__ cudaGmxSimulation cSim;
static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;

void SetCalculateAmoebaCudaPmeMutualInducedFieldSim(amoebaGpuContext amoebaGpu)
{
    cudaError_t status;
    gpuContext gpu = amoebaGpu->gpuContext;
    status         = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));    
    RTERROR(status, "SetCalculateAmoebaCudaPmeMutualInducedFieldSim: cudaMemcpyToSymbol: SetSim copy to cSim failed");
    status         = cudaMemcpyToSymbol(cAmoebaSim, &amoebaGpu->amoebaSim, sizeof(cudaAmoebaGmxSimulation));    
    RTERROR(status, "SetCalculateAmoebaCudaPmeMutualInducedFieldSim: cudaMemcpyToSymbol: SetSim copy to cAmoebaSim failed");
}

void GetCalculateAmoebaCudaPmeMutualInducedFieldSim(amoebaGpuContext amoebaGpu)
{
    cudaError_t status;
    gpuContext gpu = amoebaGpu->gpuContext;
    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));    
    RTERROR(status, "GetCalculateAmoebaCudaPmeMutualInducedFieldSim: cudaMemcpyFromSymbol: SetSim copy from cSim failed");
    status = cudaMemcpyFromSymbol(&amoebaGpu->amoebaSim, cAmoebaSim, sizeof(cudaAmoebaGmxSimulation));    
    RTERROR(status, "GetCalculateAmoebaCudaPmeMutualInducedFieldSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed");
}

Mark Friedrichs's avatar
Mark Friedrichs committed
36
37
//#define AMOEBA_DEBUG
#undef AMOEBA_DEBUG
38

Mark Friedrichs's avatar
Mark Friedrichs committed
39
#undef INCLUDE_MI_FIELD_BUFFERS
Mark Friedrichs's avatar
Mark Friedrichs committed
40
//#define INCLUDE_MI_FIELD_BUFFERS 
41
#include "kCalculateAmoebaCudaMutualInducedParticle.h"
Mark Friedrichs's avatar
Mark Friedrichs committed
42
#ifdef INCLUDE_MI_FIELD_BUFFERS
Mark Friedrichs's avatar
Mark Friedrichs committed
43
44
45
46
47
48
49
50
51
52
__device__ void sumTempBuffer( MutualInducedParticle& atomI, MutualInducedParticle& atomJ ){

    atomI.tempBuffer[0]  += atomJ.tempBuffer[0];
    atomI.tempBuffer[1]  += atomJ.tempBuffer[1];
    atomI.tempBuffer[2]  += atomJ.tempBuffer[2];

    atomI.tempBufferP[0] += atomJ.tempBufferP[0];
    atomI.tempBufferP[1] += atomJ.tempBufferP[1];
    atomI.tempBufferP[2] += atomJ.tempBufferP[2];
}
Mark Friedrichs's avatar
Mark Friedrichs committed
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
#endif

// file includes FixedFieldParticle struct definition/load/unload struct and body kernel for fixed E-field

__device__ void setupMutualInducedFieldPairIxn_kernel( const MutualInducedParticle& atomI, const MutualInducedParticle& atomJ,
                                                       const float uscale, float4* delta, float* preFactor2 ) {

    // compute thedelta->xeal space portion of the Ewald summation
  
    delta->x                = atomJ.x - atomI.x;
    delta->y                = atomJ.y - atomI.y;
    delta->z                = atomJ.z - atomI.z;

    // pdelta->xiodic boundary conditions

    delta->x               -= floor(delta->x*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
    delta->y               -= floor(delta->y*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
    delta->z               -= floor(delta->z*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;

    float r2                = (delta->x*delta->x) + (delta->y*delta->y) + (delta->z*delta->z); 
    if( r2 <= cSim.nonbondedCutoffSqr ){
        float r           = sqrtf(r2);

        // calculate the error function damping terms

        float ralpha      = cSim.alphaEwald*r;

        float bn0         = erfc(ralpha)/r;
        float alsq2       = 2.0f*cSim.alphaEwald*cSim.alphaEwald;
        float alsq2n      = 1.0f/(cAmoebaSim.sqrtPi*cSim.alphaEwald);
        float exp2a       = exp(-(ralpha*ralpha));
        alsq2n           *= alsq2;
        float bn1         = (bn0+alsq2n*exp2a)/r2;

        alsq2n           *= alsq2;
        float bn2         = (3.0f*bn1+alsq2n*exp2a)/r2;

        // compute the error function scaled and unscaled terms

        float scale3      = 1.0f;
        float scale5      = 1.0f;
        float damp        = atomI.damp*atomJ.damp;
        if( damp != 0.0f ){

            float ratio  = (r/damp);
                  ratio  = ratio*ratio*ratio;
            float pgamma = atomI.thole < atomJ.thole ? atomI.thole : atomJ.thole;
                  damp   = -pgamma*ratio;

            if( damp > -50.0f) {
                float expdamp = exp(damp);
                scale3        = 1.0f - expdamp;
                scale5        = 1.0f - expdamp*(1.0f-damp);
            }
        }
        float dsc3        = uscale*scale3;
        float dsc5        = uscale*scale5;

        float r3          = (r*r2);
        float r5          = (r3*r2);
        float rr3         = (1.0f-dsc3)/r3;
        float rr5         = 3.0f*(1.0f-dsc5)/r5;

        delta->w          = rr3 - bn1;
        *preFactor2       = bn2 - rr5;
    } else {
        delta->w = *preFactor2 = 0.0f;
    }
}

__device__ void calculateMutualInducedFieldPairIxn_kernel( const float inducedDipole[3], const float4 delta, const float preFactor2, float fieldSum[3] ) {

    float preFactor3  = preFactor2*(inducedDipole[0]*delta.x   + inducedDipole[1]*delta.y  + inducedDipole[2]*delta.z);

    fieldSum[0]      += preFactor3*delta.x + delta.w*inducedDipole[0];
    fieldSum[1]      += preFactor3*delta.y + delta.w*inducedDipole[1];
    fieldSum[2]      += preFactor3*delta.z + delta.w*inducedDipole[2];
}

__device__ void calculateMutualInducedFieldPairIxnNoAdd_kernel( const float inducedDipole[3], const float4 delta, const float preFactor2, float fieldSum[3] ) {

    float preFactor3  = preFactor2*(inducedDipole[0]*delta.x   + inducedDipole[1]*delta.y  + inducedDipole[2]*delta.z);

    fieldSum[0]       = preFactor3*delta.x + delta.w*inducedDipole[0];
    fieldSum[1]       = preFactor3*delta.y + delta.w*inducedDipole[1];
    fieldSum[2]       = preFactor3*delta.z + delta.w*inducedDipole[2];
}
140
141
142
143

// file includes FixedFieldParticle struct definition/load/unload struct and body kernel for fixed E-field

__device__ void calculatePmeDirectMutualInducedFieldPairIxn_kernel( MutualInducedParticle& atomI, MutualInducedParticle& atomJ,
144
                                                                    float uscale, float4 fields[3]
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
#ifdef AMOEBA_DEBUG
                                                            , float4* pullBack
#endif

 ){

    // compute the real space portion of the Ewald summation
  
    float xr          = atomJ.x - atomI.x;
    float yr          = atomJ.y - atomI.y;
    float zr          = atomJ.z - atomI.z;

    // periodic boundary conditions

    xr               -= floor(xr*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
    yr               -= floor(yr*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
    zr               -= floor(zr*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;

    float r2          = xr*xr + yr* yr + zr*zr;
Peter Eastman's avatar
Peter Eastman committed
164
165
    if( r2 <= cSim.nonbondedCutoffSqr ){
        float r           = sqrtf(r2);
166

Peter Eastman's avatar
Peter Eastman committed
167
        // calculate the error function damping terms
168

Peter Eastman's avatar
Peter Eastman committed
169
        float ralpha      = cSim.alphaEwald*r;
170

Mark Friedrichs's avatar
Mark Friedrichs committed
171
        float bn0         = erfc(ralpha)/r;
Peter Eastman's avatar
Peter Eastman committed
172
173
174
175
        float alsq2       = 2.0f*cSim.alphaEwald*cSim.alphaEwald;
        float alsq2n      = 1.0f/(cAmoebaSim.sqrtPi*cSim.alphaEwald);
        float exp2a       = exp(-(ralpha*ralpha));
        alsq2n           *= alsq2;
Mark Friedrichs's avatar
Mark Friedrichs committed
176
        float bn1         = (bn0+alsq2n*exp2a)/r2;
177

Peter Eastman's avatar
Peter Eastman committed
178
        alsq2n           *= alsq2;
Mark Friedrichs's avatar
Mark Friedrichs committed
179
        float bn2         = (3.0f*bn1+alsq2n*exp2a)/r2;
180

Peter Eastman's avatar
Peter Eastman committed
181
        // compute the error function scaled and unscaled terms
182

Peter Eastman's avatar
Peter Eastman committed
183
184
185
186
        float scale3      = 1.0f;
        float scale5      = 1.0f;
        float damp        = atomI.damp*atomJ.damp;
        if( damp != 0.0f ){
187

Peter Eastman's avatar
Peter Eastman committed
188
189
190
191
            float ratio  = (r/damp);
                  ratio  = ratio*ratio*ratio;
            float pgamma = atomI.thole < atomJ.thole ? atomI.thole : atomJ.thole;
                  damp   = -pgamma*ratio;
192

Peter Eastman's avatar
Peter Eastman committed
193
194
195
196
197
            if( damp > -50.0f) {
                float expdamp = exp(damp);
                scale3        = 1.0f - expdamp;
                scale5        = 1.0f - expdamp*(1.0f-damp);
            }
198
        }
Peter Eastman's avatar
Peter Eastman committed
199
200
        float dsc3        = uscale*scale3;
        float dsc5        = uscale*scale5;
201

Peter Eastman's avatar
Peter Eastman committed
202
203
204
        float r3          = (r*r2);
        float r5          = (r3*r2);
        float rr3         = (1.0f-dsc3)/r3;
Mark Friedrichs's avatar
Mark Friedrichs committed
205
        float rr5         = 3.0f*(1.0f-dsc5)/r5;
206

Mark Friedrichs's avatar
Mark Friedrichs committed
207
208
        float preFactor1  = rr3 - bn1;
        float preFactor2  = bn2 - rr5;
209

Mark Friedrichs's avatar
Mark Friedrichs committed
210
211
        float dukr        = atomJ.inducedDipole[0]*xr      + atomJ.inducedDipole[1]*yr      + atomJ.inducedDipole[2]*zr;
        float preFactor3  = preFactor2*dukr;
212

Mark Friedrichs's avatar
Mark Friedrichs committed
213
214
215
        fields[0].x       = preFactor3*xr + preFactor1*atomJ.inducedDipole[0];
        fields[1].x       = preFactor3*yr + preFactor1*atomJ.inducedDipole[1];
        fields[2].x       = preFactor3*zr + preFactor1*atomJ.inducedDipole[2];
216
217


Mark Friedrichs's avatar
Mark Friedrichs committed
218
219
        float duir        = atomI.inducedDipole[0]*xr      + atomI.inducedDipole[1]*yr      + atomI.inducedDipole[2]*zr;
        preFactor3        = preFactor2*duir;
220

Mark Friedrichs's avatar
Mark Friedrichs committed
221
222
223
        fields[0].y       = preFactor3*xr + preFactor1*atomI.inducedDipole[0];
        fields[1].y       = preFactor3*yr + preFactor1*atomI.inducedDipole[1];
        fields[2].y       = preFactor3*zr + preFactor1*atomI.inducedDipole[2];
224
225


Mark Friedrichs's avatar
Mark Friedrichs committed
226
227
        float pukr        = atomJ.inducedDipolePolar[0]*xr + atomJ.inducedDipolePolar[1]*yr + atomJ.inducedDipolePolar[2]*zr;
        preFactor3        = preFactor2*pukr;
228

Mark Friedrichs's avatar
Mark Friedrichs committed
229
230
231
        fields[0].z       = preFactor3*xr + preFactor1*atomJ.inducedDipolePolar[0];
        fields[1].z       = preFactor3*yr + preFactor1*atomJ.inducedDipolePolar[1];
        fields[2].z       = preFactor3*zr + preFactor1*atomJ.inducedDipolePolar[2];
232
233


Mark Friedrichs's avatar
Mark Friedrichs committed
234
235
236
237
238
        float puir        = atomI.inducedDipolePolar[0]*xr + atomI.inducedDipolePolar[1]*yr + atomI.inducedDipolePolar[2]*zr;
        preFactor3        = preFactor2*puir;
        fields[0].w       = preFactor3*xr + preFactor1*atomI.inducedDipolePolar[0];
        fields[1].w       = preFactor3*yr + preFactor1*atomI.inducedDipolePolar[1];
        fields[2].w       = preFactor3*zr + preFactor1*atomI.inducedDipolePolar[2];
239
240
241

    } else {

242
243
244
245
        fields[0].x       = 0.0f;
        fields[0].y       = 0.0f;
        fields[0].z       = 0.0f;
        fields[0].w       = 0.0f;
246
    
247
248
249
250
        fields[1].x       = 0.0f;
        fields[1].y       = 0.0f;
        fields[1].z       = 0.0f;
        fields[1].w       = 0.0f;
251
    
252
253
254
255
        fields[2].x       = 0.0f;
        fields[2].y       = 0.0f;
        fields[2].z       = 0.0f;
        fields[2].w       = 0.0f;
256
    }
Mark Friedrichs's avatar
Mark Friedrichs committed
257
/*
258
259
260
261
262
263
264
#ifdef AMOEBA_DEBUG
    pullBack[0].x = xr;
    pullBack[0].y = yr;
    pullBack[0].z = zr;
    pullBack[0].w = r2;

    pullBack[1].x = alsq2;
265
266
    pullBack[1].y = bn0;
    pullBack[1].z = bn2;
267
268
269
270
271
272
273
274
275
276
    pullBack[1].w = exp2a;

    pullBack[1].x = atomJ.x - atomI.x;
    pullBack[1].y = atomJ.y - atomI.y;
    pullBack[1].z = atomJ.z - atomI.z;
    pullBack[1].w = (atomJ.x - atomI.x)*(atomJ.x - atomI.x) + (atomJ.y - atomI.y)*(atomJ.y - atomI.y)+ (atomJ.z - atomI.z)*(atomJ.z - atomI.z);
    pullBack[1].x = scale3;
    pullBack[1].y = scale5;
    pullBack[1].z = scale7;
#endif
Mark Friedrichs's avatar
Mark Friedrichs committed
277
*/
278
279
280
281
}

// Include versions of the kernels for N^2 calculations.

Mark Friedrichs's avatar
Mark Friedrichs committed
282
#define METHOD_NAME(a, b) a##Cutoff##b
283
284
285
#include "kCalculateAmoebaCudaPmeMutualInducedField.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
Mark Friedrichs's avatar
Mark Friedrichs committed
286
#define METHOD_NAME(a, b) a##CutoffByWarp##b
287
288
289
290
291
#include "kCalculateAmoebaCudaPmeMutualInducedField.h"

__global__ 
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
292
#elif (__CUDA_ARCH__ >= 120)
293
294
295
296
297
298
299
300
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
static void kInitializeMutualInducedField_kernel(
                   int numberOfAtoms,
                   float* fixedEField,
                   float* fixedEFieldPolar,
301
                   float* polarizability )
302
303
{

Mark Friedrichs's avatar
Mark Friedrichs committed
304
    int pos = blockIdx.x*blockDim.x + threadIdx.x;
305
306
307
308
    while( pos < 3*cSim.atoms )
    {   
        fixedEField[pos]         *= polarizability[pos];
        fixedEFieldPolar[pos]    *= polarizability[pos];
309

310
311
        pos                      += blockDim.x*gridDim.x;
    }
312
313
314
315
316
317

}

__global__ 
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
318
#elif (__CUDA_ARCH__ >= 120)
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
static void kReduceMutualInducedFieldDelta_kernel(int numberOfEntries, float* arrayOfDeltas1, float* arrayOfDeltas2, float* epsilon )
{
    extern __shared__ float2 delta[];

    delta[threadIdx.x].x    = 0.0f;
    delta[threadIdx.x].y    = 0.0f;

    unsigned int pos = threadIdx.x;

    // load deltas

    while( pos < numberOfEntries )
    {   
        delta[threadIdx.x].x  += arrayOfDeltas1[pos];
        delta[threadIdx.x].y  += arrayOfDeltas2[pos];
        pos                   += blockDim.x*gridDim.x;
    }   
    __syncthreads();

    // sum the deltas

    for (int offset = 1; offset < blockDim.x; offset *= 2 )
    {   
        if (threadIdx.x + offset < blockDim.x && (threadIdx.x & (2*offset-1)) == 0)
        {
            delta[threadIdx.x].x   += delta[threadIdx.x+offset].x;
            delta[threadIdx.x].y   += delta[threadIdx.x+offset].y;
        }
        __syncthreads();
    }   

    // set epsilons

    if (threadIdx.x == 0)
    {   
        epsilon[0]  = delta[0].x > delta[0].y ? delta[0].x : delta[0].y;
359
        epsilon[0]  = 48.033324f*sqrtf( epsilon[0]/( (float) (numberOfEntries/3)) );
360
#ifdef AMOEBA_DEBUG
361
362
        epsilon[1]  = 48.033324f*sqrtf( delta[0].x/( (float) (numberOfEntries/3)) );
        epsilon[2]  = 48.033324f*sqrtf( delta[0].y/( (float) (numberOfEntries/3)) );
363
364
365
366
367
368
369
370
371
372
373
374
#endif
    }   
}

/**

   matrixProduct/matrixProductP contains epsilon**2 on output

*/
__global__ 
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
375
#elif (__CUDA_ARCH__ >= 120)
376
377
378
379
380
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
static void kSorUpdateMutualInducedField_kernel(
381
                   float* polarizability,
382
383
384
385
386
                   float* inducedDipole, float* inducedDipoleP,
                   float* fixedEField,   float* fixedEFieldP,
                   float* matrixProduct, float* matrixProductP )
{

387
388
389
390
    int pos                        = blockIdx.x*blockDim.x + threadIdx.x;
    const float term               = (4.0f/3.0f)*(cSim.alphaEwald*cSim.alphaEwald*cSim.alphaEwald)/cAmoebaSim.sqrtPi;
    const float polarSOR           = 0.70f;

391
392
    while( pos < 3*cSim.atoms )
    {   
393

394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
        float previousDipole           = inducedDipole[pos];
        float previousDipoleP          = inducedDipoleP[pos];
    
        // add self terms to fields
    
        matrixProduct[pos]            +=  term*previousDipole;
        matrixProductP[pos]           +=  term*previousDipoleP;
    
        inducedDipole[pos]             = fixedEField[pos]     + polarizability[pos]*matrixProduct[pos];
        inducedDipoleP[pos]            = fixedEFieldP[pos]    + polarizability[pos]*matrixProductP[pos];
    
        inducedDipole[pos]             = previousDipole   + polarSOR*( inducedDipole[pos]   - previousDipole  );   
        inducedDipoleP[pos]            = previousDipoleP  + polarSOR*( inducedDipoleP[pos]  - previousDipoleP );
    
        matrixProduct[pos]             = ( inducedDipole[pos]  - previousDipole  )*( inducedDipole[pos]  - previousDipole  );
        matrixProductP[pos]            = ( inducedDipoleP[pos] - previousDipoleP )*( inducedDipoleP[pos] - previousDipoleP );
410

411
412
        pos                           += blockDim.x*gridDim.x;
    }
413
414
415
416
417
418
419
420

}

// reduce psWorkArray_3_1
// reduce psWorkArray_3_2

static void kReduceMutualInducedFields(amoebaGpuContext amoebaGpu, CUDAStream<float>* outputArray, CUDAStream<float>* outputPolarArray )
{
421
422
423
    gpuContext gpu = amoebaGpu->gpuContext;
    kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
                               gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
424
                               amoebaGpu->psWorkArray_3_1->_pDevData, outputArray->_pDevData, 0 );
425
426
    LAUNCHERROR("kReducePmeMI_Fields1");

427
428
    kReduceFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
                               gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
429
                               amoebaGpu->psWorkArray_3_2->_pDevData, outputPolarArray->_pDevData, 0 );
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
    LAUNCHERROR("kReducePmeMI_Fields2");
}

/**---------------------------------------------------------------------------------------

   Compute mutual induce field

   @param amoebaGpu        amoebaGpu context

   --------------------------------------------------------------------------------------- */

static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuContext amoebaGpu,
                                                                  CUDAStream<float>* outputArray, CUDAStream<float>* outputPolarArray )
{
  
Mark Friedrichs's avatar
Mark Friedrichs committed
445
446
  static unsigned int threadsPerBlock  = 0;
  gpuContext gpu                       = amoebaGpu->gpuContext;
447
448

#ifdef AMOEBA_DEBUG
Mark Friedrichs's avatar
Mark Friedrichs committed
449
    int targetAtom                = 546;
450
    static const char* methodName = "cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply";
451
    static int iteration          = 1;
452
    if( 1 && amoebaGpu->log ){
Mark Friedrichs's avatar
Mark Friedrichs committed
453
        (void) fprintf( amoebaGpu->log, "%s\n", methodName );
454
455
456
        (void) fflush( amoebaGpu->log );
    }
    int paddedNumberOfAtoms                    = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
457
    int maxSlots                               = 10;
Mark Friedrichs's avatar
Mark Friedrichs committed
458
459
    CUDAStream<float4>* debugArray             = new CUDAStream<float4>(maxSlots*paddedNumberOfAtoms, 1, "DebugArray");
    memset( debugArray->_pSysData,      0, sizeof( float )*4*maxSlots*paddedNumberOfAtoms);
460
461
462
463
464
    debugArray->Upload();
#endif

    kClearFields_3( amoebaGpu, 2 );

Mark Friedrichs's avatar
Mark Friedrichs committed
465
466
467
468
469
470
471
472
473
474
    // on first pass, set threads/block

    if( threadsPerBlock == 0 ){  
        unsigned int maxThreads;
        if (gpu->sm_version >= SM_20)
            maxThreads = 384; 
        else if (gpu->sm_version >= SM_12)
            maxThreads = 128; 
        else
            maxThreads = 64; 
Mark Friedrichs's avatar
Mark Friedrichs committed
475
        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(MutualInducedParticle), gpu->sharedMemoryPerBlock ), maxThreads);
Mark Friedrichs's avatar
Mark Friedrichs committed
476
477
    }    

Mark Friedrichs's avatar
Mark Friedrichs committed
478
#ifdef AMOEBA_DEBUG
479
    if( amoebaGpu->log ){
Mark Friedrichs's avatar
Mark Friedrichs committed
480
481
        (void) fprintf( amoebaGpu->log, "cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u\n",
                        gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
482
483
484
485
                        sizeof(MutualInducedParticle), sizeof(MutualInducedParticle)*threadsPerBlock,
                        (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
        (void) fflush( amoebaGpu->log );
    }
Mark Friedrichs's avatar
Mark Friedrichs committed
486
#endif
487
488

    if (gpu->bOutputBufferPerWarp){
489

490
        kCalculateAmoebaPmeMutualInducedFieldCutoffByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
Mark Friedrichs's avatar
Mark Friedrichs committed
491
                                                                 gpu->sim.pInteractingWorkUnit,
Mark Friedrichs's avatar
Mark Friedrichs committed
492
                                                                 amoebaGpu->psWorkArray_3_1->_pDevData,
493
494
495
496
#ifdef AMOEBA_DEBUG
                                                                 amoebaGpu->psWorkArray_3_2->_pDevData,
                                                                 debugArray->_pDevData, targetAtom );
#else
Mark Friedrichs's avatar
Mark Friedrichs committed
497
                                                                 amoebaGpu->psWorkArray_3_2->_pDevData );
498
#endif
499
500
501

    } else {

502
        kCalculateAmoebaPmeMutualInducedFieldCutoff_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
Mark Friedrichs's avatar
Mark Friedrichs committed
503
                                                                 gpu->sim.pInteractingWorkUnit,
Mark Friedrichs's avatar
Mark Friedrichs committed
504
                                                                 amoebaGpu->psWorkArray_3_1->_pDevData,
505
506
507
508
#ifdef AMOEBA_DEBUG
                                                                 amoebaGpu->psWorkArray_3_2->_pDevData,
                                                                 debugArray->_pDevData, targetAtom );
#else
Mark Friedrichs's avatar
Mark Friedrichs committed
509
                                                                 amoebaGpu->psWorkArray_3_2->_pDevData );
510
511
512
#endif


513
514
515
516
517
518
519
    }
    LAUNCHERROR("kCalculateAmoebaPmeMutualInducedField");

    kReduceMutualInducedFields( amoebaGpu, outputArray, outputPolarArray );

#ifdef AMOEBA_DEBUG
    if( amoebaGpu->log && iteration == 1 ){
Mark Friedrichs's avatar
Mark Friedrichs committed
520
521
        (void) fprintf( amoebaGpu->log, "Finished maxtrixMultiply kernel execution %d -- Direct only -- self added in kSorUpdateMutualInducedField_kernel\n",
                        iteration ); (void) fflush( amoebaGpu->log );
522
523
        outputArray->Download();
        outputPolarArray->Download();
524
525
        //debugArray->Download();
        int maxPrint = 5;
526
527
528
529
530
531
532
533
        for( int ii = 0; ii < gpu->natoms; ii++ ){
            (void) fprintf( amoebaGpu->log, "%5d ", ii); 
 
             int indexOffset     = ii*3;
     
            // MI
 
            (void) fprintf( amoebaGpu->log,"Mult[%16.9e %16.9e %16.9e] ",
Mark Friedrichs's avatar
Mark Friedrichs committed
534
535
536
                            outputArray->_pSysData[indexOffset],
                            outputArray->_pSysData[indexOffset+1],
                            outputArray->_pSysData[indexOffset+2] );
537
538
539
540
     
            // MI polar
 
            (void) fprintf( amoebaGpu->log,"MultP[%16.9e %16.9e %16.9e]\n",
Mark Friedrichs's avatar
Mark Friedrichs committed
541
542
543
                            outputPolarArray->_pSysData[indexOffset],
                            outputPolarArray->_pSysData[indexOffset+1],
                            outputPolarArray->_pSysData[indexOffset+2] );
544
545
546
547
548
549
550
551
552
553
554
555
            if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
                ii = gpu->natoms - maxPrint;
            }

        }
/*
        int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
        for( int jj = 0; jj < gpu->natoms; jj++ ){
            int debugIndex = jj; 
            (void) fprintf( amoebaGpu->log,"%5d PmeMIMult\n", jj );
            for( int kk = 0; kk < 7; kk++ ){
                (void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %16.9e]\n",
Mark Friedrichs's avatar
Mark Friedrichs committed
556
557
                                debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
                                debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w );
558
559
560
561
562
563
564
565
566
567
                debugIndex += paddedNumberOfAtoms;
            }
            (void) fprintf( amoebaGpu->log,"\n" );

        }
*/
        (void) fflush( amoebaGpu->log );
        iteration++;

     }
568
     delete debugArray;
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
#endif

}

/**---------------------------------------------------------------------------------------

   Compute mutual induce field

   @param amoebaGpu        amoebaGpu context

   --------------------------------------------------------------------------------------- */

static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoebaGpu )
{
  
   // ---------------------------------------------------------------------------------------

586
//#define AMOEBA_DEBUG
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
#ifdef AMOEBA_DEBUG
    static const char* methodName = "cudaComputeAmoebaPmeMutualInducedFieldBySOR";
    static int timestep = 0;
    std::vector<int> fileId;
    timestep++;
    fileId.resize( 2 );
    fileId[0] = timestep;
    fileId[1] = 1;
#endif

   // ---------------------------------------------------------------------------------------

    int done;
    int iteration;

     gpuContext gpu    = amoebaGpu->gpuContext;

   // ---------------------------------------------------------------------------------------

    // set  E_Field & E_FieldPolar] to [ E_Field & E_FieldPolar]*Polarizability
    // initialize [ InducedDipole & InducedDipolePolar ] to [ E_Field & E_FieldPolar]*Polarizability

609
    kInitializeMutualInducedField_kernel<<< gpu->sim.blocks, gpu->sim.threads_per_block >>>(
610
         gpu->natoms,
Mark Friedrichs's avatar
Mark Friedrichs committed
611
612
         amoebaGpu->psE_Field->_pDevData,
         amoebaGpu->psE_FieldPolar->_pDevData,
613
         amoebaGpu->psPolarizability->_pDevData );
614
615
    LAUNCHERROR("AmoebaPmeMutualInducedFieldSetup");  

616
617
618
    cudaMemcpy( amoebaGpu->psInducedDipole->_pDevData,        amoebaGpu->psE_Field->_pDevData,       3*gpu->sim.paddedNumberOfAtoms*sizeof( float ), cudaMemcpyDeviceToDevice );
    cudaMemcpy( amoebaGpu->psInducedDipolePolar->_pDevData,   amoebaGpu->psE_FieldPolar->_pDevData,  3*gpu->sim.paddedNumberOfAtoms*sizeof( float ), cudaMemcpyDeviceToDevice );

619
620
621
#ifdef AMOEBA_DEBUG
    if( amoebaGpu->log ){

Mark Friedrichs's avatar
Mark Friedrichs committed
622
623
        std::vector<int> fileId;
        VectorOfDoubleVectors outputVector;
624
625
626
627
        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psE_Field,            outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psE_FieldPolar,       outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psInducedDipole,      outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psInducedDipolePolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
628
        cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeEFieldPolarity", fileId, outputVector );
629
630
631
    }   
#endif

632
633
634
635
636
637
638
639
640
641
    // if polarization type is direct, set flags signalling done and return

    if( amoebaGpu->amoebaSim.polarizationType )
    {
        amoebaGpu->mutualInducedDone          = 1;
        amoebaGpu->mutualInducedConverged     = 1;
        kCalculateAmoebaPMEInducedDipoleField( amoebaGpu );
        return;
    }

642
643
644
645
646
647
648
649
650
651
    // ---------------------------------------------------------------------------------------
 
    done      = 0;
    iteration = 1;

    while( !done ){

        // matrix multiply

        cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpu, amoebaGpu->psWorkVector[0],  amoebaGpu->psWorkVector[1] );
652
        kCalculateAmoebaPMEInducedDipoleField( amoebaGpu );
653

654
#ifdef AMOEBA_DEBUG
Mark Friedrichs's avatar
Mark Friedrichs committed
655
656
657
658
659
        if( 0 ){
            gpuContext gpu = amoebaGpu->gpuContext;
            std::vector<int> fileId;
            fileId.push_back( iteration );
            VectorOfDoubleVectors outputVector;
660
661
662
663
664
/*
            cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psE_Field, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
            cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psE_FieldPolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
            cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psPolarizability, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
*/
Mark Friedrichs's avatar
Mark Friedrichs committed
665
666
667
668
            cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psInducedDipole, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
            cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psInducedDipolePolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
            cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psWorkVector[0], outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
            cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psWorkVector[1], outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
669
            cudaWriteVectorOfDoubleVectorsToFile( "CudaPrePostPmeDirectMI", fileId, outputVector );
Mark Friedrichs's avatar
Mark Friedrichs committed
670
        }
671
672
#endif
        // post matrix multiply
Mark Friedrichs's avatar
Mark Friedrichs committed
673

674
675
        kSorUpdateMutualInducedField_kernel<<< gpu->sim.blocks, gpu->sim.threads_per_block >>>(
           amoebaGpu->psPolarizability->_pDevData,
Mark Friedrichs's avatar
Mark Friedrichs committed
676
677
678
           amoebaGpu->psInducedDipole->_pDevData, amoebaGpu->psInducedDipolePolar->_pDevData,
           amoebaGpu->psE_Field->_pDevData,       amoebaGpu->psE_FieldPolar->_pDevData,
           amoebaGpu->psWorkVector[0]->_pDevData, amoebaGpu->psWorkVector[1]->_pDevData );
679
680
        LAUNCHERROR("kSorUpdatePmeMutualInducedField");  

681
#ifdef AMOEBA_DEBUG
Mark Friedrichs's avatar
Mark Friedrichs committed
682
683
684
685
686
        if( 0 ){
            gpuContext gpu = amoebaGpu->gpuContext;
            std::vector<int> fileId;
            fileId.push_back( iteration );
            VectorOfDoubleVectors outputVector;
687
688
            //cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psE_Field, outputVector, gpu->psAtomIndex->_pSysData );
            //cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psE_FieldPolar, outputVector, gpu->psAtomIndex->_pSysData );
Mark Friedrichs's avatar
Mark Friedrichs committed
689
            cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psPolarizability, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
Mark Friedrichs's avatar
Mark Friedrichs committed
690
691
692
693
            cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psInducedDipole, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
            cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psInducedDipolePolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
            cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeDirectMI", fileId, outputVector );
        }
694
#endif
695
696
697
        // get total epsilon -- performing sums on gpu

        kReduceMutualInducedFieldDelta_kernel<<<1, amoebaGpu->epsilonThreadsPerBlock, 2*sizeof(float)*amoebaGpu->epsilonThreadsPerBlock>>>(
Mark Friedrichs's avatar
Mark Friedrichs committed
698
699
           3*gpu->natoms, amoebaGpu->psWorkVector[0]->_pDevData, amoebaGpu->psWorkVector[1]->_pDevData,
           amoebaGpu->psCurrentEpsilon->_pDevData );
700
701
        LAUNCHERROR("kReducePmeMutualInducedFieldDelta");

702
#ifdef AMOEBA_DEBUG
703
        if( 0 && amoebaGpu->log ){ // trackMutualInducedIterations
704
705
            trackMutualInducedIterations( amoebaGpu, iteration);
        }
706
#endif
707

708
        // Debye=48.033324f
709
        amoebaGpu->psCurrentEpsilon->Download();
Mark Friedrichs's avatar
Mark Friedrichs committed
710
        float currentEpsilon                     = amoebaGpu->psCurrentEpsilon->_pSysData[0];
711
712
713
714
715
716
717
718
719
720
721
        amoebaGpu->mutualInducedCurrentEpsilon   = currentEpsilon;

        if( iteration > amoebaGpu->mutualInducedMaxIterations || amoebaGpu->mutualInducedCurrentEpsilon < amoebaGpu->mutualInducedTargetEpsilon ){ 
            done = 1;
        }

#ifdef AMOEBA_DEBUG
        if( amoebaGpu->log ){
           amoebaGpu->psInducedDipole->Download();
           amoebaGpu->psInducedDipolePolar->Download();
#if 1
722
723
           (void) fprintf( amoebaGpu->log, "cudaComputeAmoebaPmeMutualInducedFieldBySOR iteration=%3d eps %14.6e [%14.6e %14.6e] done=%d\n",
                           iteration, amoebaGpu->mutualInducedCurrentEpsilon,
Mark Friedrichs's avatar
Mark Friedrichs committed
724
725
                           amoebaGpu->psCurrentEpsilon->_pSysData[1], 
                           amoebaGpu->psCurrentEpsilon->_pSysData[2], done );
726
727
728
#else
           (void) fprintf( amoebaGpu->log, "%s iteration=%3d eps %14.6e %14.6e crrntEps=%14.6e %14.6e %14.6e %14.6e done=%d\n",
                           methodName, iteration, sum1, sum2, amoebaGpu->mutualInducedCurrentEpsilon,
Mark Friedrichs's avatar
Mark Friedrichs committed
729
730
731
                           amoebaGpu->psCurrentEpsilon->_pSysData[0], 
                           amoebaGpu->psCurrentEpsilon->_pSysData[1], 
                           amoebaGpu->psCurrentEpsilon->_pSysData[2], done );
732
733
734
#endif
           (void) fflush( amoebaGpu->log );

Mark Friedrichs's avatar
Mark Friedrichs committed
735
736
737
738
739
            if( 0 ){
                gpuContext gpu = amoebaGpu->gpuContext;
                std::vector<int> fileId;
                fileId.push_back( iteration );
                VectorOfDoubleVectors outputVector;
740
741
742
743
                cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psE_Field, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
                cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psE_FieldPolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
                cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psInducedDipole, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
                cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psInducedDipolePolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
Mark Friedrichs's avatar
Mark Friedrichs committed
744
745
746
                cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeMI", fileId, outputVector );
            }
/*
747
            int offset   = 0;
Mark Friedrichs's avatar
Mark Friedrichs committed
748
            int maxPrint = 10;
749
750
751
752
            for( int ii = 0; ii < gpu->natoms; ii++ ){
                (void) fprintf( amoebaGpu->log, "%4d ", ii ); 
    
                (void) fprintf( amoebaGpu->log," Mi[%14.6e %14.6e %14.6e] ",
Mark Friedrichs's avatar
Mark Friedrichs committed
753
754
755
                                amoebaGpu->psInducedDipole->_pSysData[offset],
                                amoebaGpu->psInducedDipole->_pSysData[offset+1],
                                amoebaGpu->psInducedDipole->_pSysData[offset+2] );
756
                (void) fprintf( amoebaGpu->log,"Mip[%14.6e %14.6e %14.6e]\n",
Mark Friedrichs's avatar
Mark Friedrichs committed
757
758
759
                                amoebaGpu->psInducedDipolePolar->_pSysData[offset],
                                amoebaGpu->psInducedDipolePolar->_pSysData[offset+1],
                                amoebaGpu->psInducedDipolePolar->_pSysData[offset+2] );
760
761
762
763
764
765
766
767
                if( ii == maxPrint && (ii < (gpu->natoms - maxPrint) ) ){
                    ii =  (gpu->natoms - maxPrint);
                    offset = 3*(ii+1);
                } else {
                    offset += 3;
                }
            }   
            (void) fflush( amoebaGpu->log );
Mark Friedrichs's avatar
Mark Friedrichs committed
768
*/
Mark Friedrichs's avatar
Mark Friedrichs committed
769

Mark Friedrichs's avatar
Mark Friedrichs committed
770
771
772
773
            if( 0 ){
                std::vector<int> fileId;
                fileId.push_back( iteration );
                VectorOfDoubleVectors outputVector;
774
775
776
                cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,                    outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
                cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psInducedDipole,      outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
                cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psInducedDipolePolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
Mark Friedrichs's avatar
Mark Friedrichs committed
777
778
                cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeMI", fileId, outputVector );
            }
Mark Friedrichs's avatar
Mark Friedrichs committed
779

780
        }
781

Mark Friedrichs's avatar
Mark Friedrichs committed
782
783
784
785
786
        (void) fprintf( amoebaGpu->log, "MI iteration=%3d eps %14.6e [%14.6e %14.6e] done=%d\n",
                        iteration, amoebaGpu->mutualInducedCurrentEpsilon,
                        amoebaGpu->psCurrentEpsilon->_pSysData[1], 
                        amoebaGpu->psCurrentEpsilon->_pSysData[2], done );
        (void) fflush( amoebaGpu->log );
Mark Friedrichs's avatar
Mark Friedrichs committed
787

Mark Friedrichs's avatar
Mark Friedrichs committed
788
789

        if( amoebaGpu->log ){
790
791
792
            (void) fprintf( amoebaGpu->log, "MI iteration=%3d eps %14.6e done=%d\n",
                            iteration, amoebaGpu->mutualInducedCurrentEpsilon, done );
            (void) fflush( amoebaGpu->log );
Mark Friedrichs's avatar
Mark Friedrichs committed
793
        }
Mark Friedrichs's avatar
Mark Friedrichs committed
794
#endif
Mark Friedrichs's avatar
Mark Friedrichs committed
795

796
        // exit if nan
Mark Friedrichs's avatar
Mark Friedrichs committed
797

798
        if( amoebaGpu->mutualInducedCurrentEpsilon != amoebaGpu->mutualInducedCurrentEpsilon ){
Mark Friedrichs's avatar
Mark Friedrichs committed
799
            (void) fprintf( stderr, "PME MI iteration=%3d eps is nan -- exiting.\n", iteration );
800
801
            exit(0);
        }
Mark Friedrichs's avatar
Mark Friedrichs committed
802

803
804
805
806
807
808
        iteration++;
    }

    amoebaGpu->mutualInducedDone             = done;
    amoebaGpu->mutualInducedConverged        = ( !done || iteration > amoebaGpu->mutualInducedMaxIterations ) ? 0 : 1;

809
#ifdef AMOEBA_DEBUG
Mark Friedrichs's avatar
Mark Friedrichs committed
810
    if( 0 ){
811
812
813
        std::vector<int> fileId;
        //fileId.push_back( 0 );
        VectorOfDoubleVectors outputVector;
Mark Friedrichs's avatar
Mark Friedrichs committed
814
        cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,                    outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
815
816
        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psInducedDipole,      outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psInducedDipolePolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
817
818
819
        cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeMI", fileId, outputVector );
     }

Mark Friedrichs's avatar
Mark Friedrichs committed
820
821
822
823
824
    if( 0 ){
        static int iteration = 0;
        checkForNans( gpu->natoms,  3, amoebaGpu->psInducedDipole, gpu->psAtomIndex->_pSysData,    ++iteration, "CudaPmeMI", stderr );
        checkForNans( gpu->natoms,  3, amoebaGpu->psInducedDipolePolar, gpu->psAtomIndex->_pSysData, iteration, "CudaPmeMIPolar", stderr );
     }
825
#endif
Mark Friedrichs's avatar
Mark Friedrichs committed
826

827
828
829
830
831
832
833
834
835
   // ---------------------------------------------------------------------------------------
}

void cudaComputeAmoebaPmeMutualInducedField( amoebaGpuContext amoebaGpu )
{
    if( amoebaGpu->mutualInducedIterativeMethod == 0 ){
        cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpu );
    }
}