kCalculateAmoebaCudaPmeFixedEField.cu 27.1 KB
Newer Older
1
2
3
4
5
6
7
8

//-----------------------------------------------------------------------------------------

//-----------------------------------------------------------------------------------------

#include "amoebaCudaKernels.h"
#include "kCalculateAmoebaCudaUtilities.h"

Mark Friedrichs's avatar
Mark Friedrichs committed
9
//#define AMOEBA_DEBUG
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36

static __constant__ cudaGmxSimulation cSim;
static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;

void SetCalculateAmoebaCudaPmeFixedEFieldSim(amoebaGpuContext amoebaGpu)
{
    cudaError_t status;
    gpuContext gpu = amoebaGpu->gpuContext;
    status         = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));    
    RTERROR(status, "SetCalculateAmoebaCudaPmeFixedEFieldSim: cudaMemcpyToSymbol: SetSim copy to cSim failed");
    status         = cudaMemcpyToSymbol(cAmoebaSim, &amoebaGpu->amoebaSim, sizeof(cudaAmoebaGmxSimulation));    
    RTERROR(status, "SetCalculateAmoebaCudaPmeFixedEFieldSim: cudaMemcpyToSymbol: SetSim copy to cAmoebaSim failed");
}

void GetCalculateAmoebaCudaPmeFixedEFieldSim(amoebaGpuContext amoebaGpu)
{
    cudaError_t status;
    gpuContext gpu = amoebaGpu->gpuContext;
    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));    
    RTERROR(status, "GetCalculateAmoebaCudaPmeFixedEFieldSim: cudaMemcpyFromSymbol: SetSim copy from cSim failed");
    status = cudaMemcpyFromSymbol(&amoebaGpu->amoebaSim, cAmoebaSim, sizeof(cudaAmoebaGmxSimulation));         
    RTERROR(status, "GetCalculateAmoebaCudaPmeFixedEFieldSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed");
}

__global__ 
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
37
#elif (__CUDA_ARCH__ >= 120)
38
39
40
41
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
Mark Friedrichs's avatar
Mark Friedrichs committed
42
static void kReducePmeEFieldPolar_kernel( unsigned int fieldComponents, unsigned int outputBuffers, float* EFieldReciprocal,  float* fieldIn, float* fieldOut )
43
44
45
46
47
{
    unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;

    // Reduce field

48
    const float term = (4.0f/3.0f)*(cSim.alphaEwald*cSim.alphaEwald*cSim.alphaEwald)/cAmoebaSim.sqrtPi;
Mark Friedrichs's avatar
Mark Friedrichs committed
49
    //const float term = 0.0f;
50
51
52
53
54
    while (pos < fieldComponents)
    {   

        // self-term included here

Mark Friedrichs's avatar
Mark Friedrichs committed
55
        float totalField = EFieldReciprocal[pos] + term*cAmoebaSim.pLabFrameDipole[pos];
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76

        float* pFt       = fieldIn + pos;
        unsigned int i   = outputBuffers;
        while (i >= 4)
        {   
            totalField += pFt[0] + pFt[fieldComponents] + pFt[2*fieldComponents] + pFt[3*fieldComponents];
            pFt        += fieldComponents*4;
            i          -= 4;
        }   

        if (i >= 2)
        {   
            totalField += pFt[0] + pFt[fieldComponents];
            pFt        += fieldComponents*2;
            i          -= 2;
        }   

        if (i > 0)
        {   
            totalField += pFt[0];
        }   
Mark Friedrichs's avatar
Mark Friedrichs committed
77

78
79
80
81
82
        fieldOut[pos]   = totalField;
        pos            += gridDim.x * blockDim.x;
    }   
}

Mark Friedrichs's avatar
Mark Friedrichs committed
83
84
85
__global__ 
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
86
#elif (__CUDA_ARCH__ >= 120)
Mark Friedrichs's avatar
Mark Friedrichs committed
87
88
89
90
91
92
93
94
95
96
97
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
static void kReducePmeEField_kernel( unsigned int fieldComponents, unsigned int outputBuffers,  float* fieldIn, float* fieldOut )
{
    unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;

    // Reduce field

    const float term = (4.0f/3.0f)*(cSim.alphaEwald*cSim.alphaEwald*cSim.alphaEwald)/cAmoebaSim.sqrtPi;
Mark Friedrichs's avatar
Mark Friedrichs committed
98
    //const float term = 0.0;
Mark Friedrichs's avatar
Mark Friedrichs committed
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
    while (pos < fieldComponents)
    {   

        // self-term included here

        float totalField = term*cAmoebaSim.pLabFrameDipole[pos];

        float* pFt       = fieldIn + pos;
        unsigned int i   = outputBuffers;
        while (i >= 4)
        {   
            totalField += pFt[0] + pFt[fieldComponents] + pFt[2*fieldComponents] + pFt[3*fieldComponents];
            pFt        += fieldComponents*4;
            i          -= 4;
        }   

        if (i >= 2)
        {   
            totalField += pFt[0] + pFt[fieldComponents];
            pFt        += fieldComponents*2;
            i          -= 2;
        }   

        if (i > 0)
        {   
            totalField += pFt[0];
        }   

        fieldOut[pos]  += totalField;
        pos            += gridDim.x * blockDim.x;
    }   
}

132
133
134
135
136
// reduce psWorkArray_3_1 -> EField
// reduce psWorkArray_3_2 -> EFieldPolar

static void kReducePmeDirectE_Fields(amoebaGpuContext amoebaGpu )
{
Mark Friedrichs's avatar
Mark Friedrichs committed
137
138
139
140
141

    // E_FieldPolar = E_Field (reciprocal) + E_FieldPolar (direct) + self

    kReducePmeEFieldPolar_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
                                   amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
Mark Friedrichs's avatar
Mark Friedrichs committed
142
                                   amoebaGpu->psE_Field->_pDevData, amoebaGpu->psWorkArray_3_2->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData );
143
144
    LAUNCHERROR("kReducePmeE_Fields1");

Mark Friedrichs's avatar
Mark Friedrichs committed
145
146
147
148
    // E_Field = E_Field (reciprocal) + E_Field (direct) + self

    kReducePmeEField_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
                              amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
Mark Friedrichs's avatar
Mark Friedrichs committed
149
                              amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psE_Field->_pDevData );
150
151
152
153
154
155
    LAUNCHERROR("kReducePmeE_Fields2");
}

// file includes FixedFieldParticle struct definition/load/unload struct and body kernel for fixed E-field

#undef GK
Mark Friedrichs's avatar
Mark Friedrichs committed
156
157
#undef INCLUDE_FIXED_FIELD_BUFFERS
#define INCLUDE_FIXED_FIELD_BUFFERS
158
#include "kCalculateAmoebaCudaFixedFieldParticle.h"
Mark Friedrichs's avatar
Mark Friedrichs committed
159
160
161
162
163
164
165
166
167
168
169
#undef INCLUDE_FIXED_FIELD_BUFFERS
__device__ void sumTempBuffer( FixedFieldParticle& atomI, FixedFieldParticle& atomJ ){
    atomI.tempBuffer[0]  += atomJ.tempBuffer[0];
    atomI.tempBuffer[1]  += atomJ.tempBuffer[1];
    atomI.tempBuffer[2]  += atomJ.tempBuffer[2];

    atomI.tempBufferP[0] += atomJ.tempBufferP[0];
    atomI.tempBufferP[1] += atomJ.tempBufferP[1];
    atomI.tempBufferP[2] += atomJ.tempBufferP[2];
}

170
__device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle& atomI, FixedFieldParticle& atomJ,
171
                                                            float dscale, float pscale, float4 fields[3]
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
#ifdef AMOEBA_DEBUG
                                                            , float4* pullBack
#endif

 ){

    // compute the real space portion of the Ewald summation
  
    float xr          = atomJ.x - atomI.x;
    float yr          = atomJ.y - atomI.y;
    float zr          = atomJ.z - atomI.z;

    // periodic boundary conditions

    xr               -= floor(xr*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
    yr               -= floor(yr*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
    zr               -= floor(zr*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;

Mark Friedrichs's avatar
Mark Friedrichs committed
190
    float r2          = xr*xr + yr*yr + zr*zr;
Peter Eastman's avatar
Peter Eastman committed
191
    if( r2 <= cSim.nonbondedCutoffSqr ){
Mark Friedrichs's avatar
Mark Friedrichs committed
192

Peter Eastman's avatar
Peter Eastman committed
193
        float r           = sqrtf(r2);
194

Peter Eastman's avatar
Peter Eastman committed
195
        // calculate the error function damping terms
196

Peter Eastman's avatar
Peter Eastman committed
197
        float ralpha      = cSim.alphaEwald*r;
198

Mark Friedrichs's avatar
Mark Friedrichs committed
199
        float bn0         = erfc(ralpha)/r;
Peter Eastman's avatar
Peter Eastman committed
200
201
202
203
        float alsq2       = 2.0f*cSim.alphaEwald*cSim.alphaEwald;
        float alsq2n      = 1.0f/(cAmoebaSim.sqrtPi*cSim.alphaEwald);
        float exp2a       = exp(-(ralpha*ralpha));
        alsq2n           *= alsq2;
Mark Friedrichs's avatar
Mark Friedrichs committed
204
        float bn1         = (bn0+alsq2n*exp2a)/r2;
205

Peter Eastman's avatar
Peter Eastman committed
206
        alsq2n           *= alsq2;
Mark Friedrichs's avatar
Mark Friedrichs committed
207
        float bn2         = (3.0f*bn1+alsq2n*exp2a)/r2;
208

Peter Eastman's avatar
Peter Eastman committed
209
        alsq2n           *= alsq2;
Mark Friedrichs's avatar
Mark Friedrichs committed
210
        float bn3         = (5.0f*bn2+alsq2n*exp2a)/r2;
211

Peter Eastman's avatar
Peter Eastman committed
212
        // compute the error function scaled and unscaled terms
213

Peter Eastman's avatar
Peter Eastman committed
214
215
216
217
218
        float scale3      = 1.0f;
        float scale5      = 1.0f;
        float scale7      = 1.0f;
        float damp        = atomI.damp*atomJ.damp;
        if( damp != 0.0f ){
219

Peter Eastman's avatar
Peter Eastman committed
220
221
            float ratio  = (r/damp);
                  ratio  = ratio*ratio*ratio;
222

Peter Eastman's avatar
Peter Eastman committed
223
            float pgamma = atomI.thole < atomJ.thole ? atomI.thole : atomJ.thole;
224

Peter Eastman's avatar
Peter Eastman committed
225
                  damp   = -pgamma*ratio;
226

Peter Eastman's avatar
Peter Eastman committed
227
228
229
230
231
232
            if( damp > -50.0f) {
                float expdamp = exp(damp);
                scale3        = 1.0f - expdamp;
                scale5        = 1.0f - expdamp*(1.0f-damp);
                scale7        = 1.0f - expdamp*(1.0f-damp+(0.6f*damp*damp));
            }
233
        }
Peter Eastman's avatar
Peter Eastman committed
234
235
236
        float dsc3        = dscale*scale3;
        float dsc5        = dscale*scale5;
        float dsc7        = dscale*scale7;
237

Peter Eastman's avatar
Peter Eastman committed
238
239
240
        float psc3        = pscale*scale3;
        float psc5        = pscale*scale5;
        float psc7        = pscale*scale7;
241

Peter Eastman's avatar
Peter Eastman committed
242
243
244
245
246
247
        float r3          = (r*r2);
        float r5          = (r3*r2);
        float r7          = (r5*r2);
        float drr3        = (1.0f-dsc3)/r3;
        float drr5        = 3.0f * (1.0f-dsc5)/r5;
        float drr7        = 15.0f * (1.0f-dsc7)/r7;
248

Peter Eastman's avatar
Peter Eastman committed
249
250
251
        float prr3        = (1.0f-psc3) / r3;
        float prr5        = 3.0f *(1.0f-psc5)/r5;
        float prr7        = 15.0f*(1.0f-psc7)/r7;
252

Mark Friedrichs's avatar
Mark Friedrichs committed
253
        float dir         = atomI.labFrameDipole_X*xr      + atomI.labFrameDipole_Y*yr      + atomI.labFrameDipole_Z*zr;
254

Peter Eastman's avatar
Peter Eastman committed
255
256
257
        float qix         = atomI.labFrameQuadrupole_XX*xr + atomI.labFrameQuadrupole_XY*yr + atomI.labFrameQuadrupole_XZ*zr;
        float qiy         = atomI.labFrameQuadrupole_XY*xr + atomI.labFrameQuadrupole_YY*yr + atomI.labFrameQuadrupole_YZ*zr;
        float qiz         = atomI.labFrameQuadrupole_XZ*xr + atomI.labFrameQuadrupole_YZ*yr + atomI.labFrameQuadrupole_ZZ*zr;
258

Peter Eastman's avatar
Peter Eastman committed
259
        float qir         = qix*xr + qiy*yr + qiz*zr;
260

Mark Friedrichs's avatar
Mark Friedrichs committed
261
262
        float dkr         = atomJ.labFrameDipole_X*xr      + atomJ.labFrameDipole_Y*yr      + atomJ.labFrameDipole_Z*zr;

Peter Eastman's avatar
Peter Eastman committed
263
264
265
        float qkx         = atomJ.labFrameQuadrupole_XX*xr + atomJ.labFrameQuadrupole_XY*yr + atomJ.labFrameQuadrupole_XZ*zr;
        float qky         = atomJ.labFrameQuadrupole_XY*xr + atomJ.labFrameQuadrupole_YY*yr + atomJ.labFrameQuadrupole_YZ*zr;
        float qkz         = atomJ.labFrameQuadrupole_XZ*xr + atomJ.labFrameQuadrupole_YZ*yr + atomJ.labFrameQuadrupole_ZZ*zr;
266

Mark Friedrichs's avatar
Mark Friedrichs committed
267
        float qkr         = qkx*xr + qky*yr + qkz*zr;
268

Mark Friedrichs's avatar
Mark Friedrichs committed
269
270
271
        float fim0        = -xr*(bn1*atomJ.q-bn2*dkr+bn3*qkr)    - bn1*atomJ.labFrameDipole_X  + 2.0f*bn2*qkx;
        float fim1        = -yr*(bn1*atomJ.q-bn2*dkr+bn3*qkr)    - bn1*atomJ.labFrameDipole_Y  + 2.0f*bn2*qky;
        float fim2        = -zr*(bn1*atomJ.q-bn2*dkr+bn3*qkr)    - bn1*atomJ.labFrameDipole_Z  + 2.0f*bn2*qkz;
272

Mark Friedrichs's avatar
Mark Friedrichs committed
273
274
275
        float fkm0        = xr*(bn1*atomI.q+bn2*dir+bn3*qir)     - bn1*atomI.labFrameDipole_X  - 2.0f*bn2*qix;
        float fkm1        = yr*(bn1*atomI.q+bn2*dir+bn3*qir)     - bn1*atomI.labFrameDipole_Y  - 2.0f*bn2*qiy;
        float fkm2        = zr*(bn1*atomI.q+bn2*dir+bn3*qir)     - bn1*atomI.labFrameDipole_Z  - 2.0f*bn2*qiz;
276

Mark Friedrichs's avatar
Mark Friedrichs committed
277
278
279
        float fid0        = -xr*(drr3*atomJ.q-drr5*dkr+drr7*qkr) - drr3*atomJ.labFrameDipole_X + 2.0f*drr5*qkx;
        float fid1        = -yr*(drr3*atomJ.q-drr5*dkr+drr7*qkr) - drr3*atomJ.labFrameDipole_Y + 2.0f*drr5*qky;
        float fid2        = -zr*(drr3*atomJ.q-drr5*dkr+drr7*qkr) - drr3*atomJ.labFrameDipole_Z + 2.0f*drr5*qkz;
280

Mark Friedrichs's avatar
Mark Friedrichs committed
281
282
283
        float fkd0        = xr*(drr3*atomI.q+drr5*dir+drr7*qir)  - drr3*atomI.labFrameDipole_X - 2.0f*drr5*qix;
        float fkd1        = yr*(drr3*atomI.q+drr5*dir+drr7*qir)  - drr3*atomI.labFrameDipole_Y - 2.0f*drr5*qiy;
        float fkd2        = zr*(drr3*atomI.q+drr5*dir+drr7*qir)  - drr3*atomI.labFrameDipole_Z - 2.0f*drr5*qiz;
284

Mark Friedrichs's avatar
Mark Friedrichs committed
285
286
287
        float fip0        = -xr*(prr3*atomJ.q-prr5*dkr+prr7*qkr) - prr3*atomJ.labFrameDipole_X + 2.0f*prr5*qkx;
        float fip1        = -yr*(prr3*atomJ.q-prr5*dkr+prr7*qkr) - prr3*atomJ.labFrameDipole_Y + 2.0f*prr5*qky;
        float fip2        = -zr*(prr3*atomJ.q-prr5*dkr+prr7*qkr) - prr3*atomJ.labFrameDipole_Z + 2.0f*prr5*qkz;
288

Mark Friedrichs's avatar
Mark Friedrichs committed
289
290
291
        float fkp0        = xr*(prr3*atomI.q+prr5*dir+prr7*qir)  - prr3*atomI.labFrameDipole_X - 2.0f*prr5*qix;
        float fkp1        = yr*(prr3*atomI.q+prr5*dir+prr7*qir)  - prr3*atomI.labFrameDipole_Y - 2.0f*prr5*qiy;
        float fkp2        = zr*(prr3*atomI.q+prr5*dir+prr7*qir)  - prr3*atomI.labFrameDipole_Z - 2.0f*prr5*qiz;
292

Peter Eastman's avatar
Peter Eastman committed
293
        // increment the field at each site due to this interaction
294

295
296
297
        fields[0].x       = fim0 - fid0;
        fields[1].x       = fim1 - fid1;
        fields[2].x       = fim2 - fid2;
Mark Friedrichs's avatar
Mark Friedrichs committed
298

299
300
301
        fields[0].y       = fkm0 - fkd0;
        fields[1].y       = fkm1 - fkd1;
        fields[2].y       = fkm2 - fkd2;
Mark Friedrichs's avatar
Mark Friedrichs committed
302

303
304
305
        fields[0].z       = fim0 - fip0;
        fields[1].z       = fim1 - fip1;
        fields[2].z       = fim2 - fip2;
Mark Friedrichs's avatar
Mark Friedrichs committed
306

307
308
309
        fields[0].w       = fkm0 - fkp0;
        fields[1].w       = fkm1 - fkp1;
        fields[2].w       = fkm2 - fkp2;
310
311
312
 
    } else {

313
314
315
316
        fields[0].x       = 0.0f;
        fields[0].y       = 0.0f;
        fields[0].z       = 0.0f;
        fields[0].w       = 0.0f;
317
    
318
319
320
321
        fields[1].x       = 0.0f;
        fields[1].y       = 0.0f;
        fields[1].z       = 0.0f;
        fields[1].w       = 0.0f;
322
    
323
324
325
326
        fields[2].x       = 0.0f;
        fields[2].y       = 0.0f;
        fields[2].z       = 0.0f;
        fields[2].w       = 0.0f;
327
    }
Mark Friedrichs's avatar
Mark Friedrichs committed
328

329
330
331
332
333
334
335
336
337
338
#ifdef AMOEBA_DEBUG
    pullBack[0].x = xr;
    pullBack[0].y = yr;
    pullBack[0].z = zr;
    pullBack[0].w = r2;

    pullBack[1].x = atomJ.x - atomI.x;
    pullBack[1].y = atomJ.y - atomI.y;
    pullBack[1].z = atomJ.z - atomI.z;
    pullBack[1].w = (atomJ.x - atomI.x)*(atomJ.x - atomI.x) + (atomJ.y - atomI.y)*(atomJ.y - atomI.y)+ (atomJ.z - atomI.z)*(atomJ.z - atomI.z);
Mark Friedrichs's avatar
Mark Friedrichs committed
339

340
341
342
343
344
#endif
}

// Include versions of the kernels for N^2 calculations.

Mark Friedrichs's avatar
Mark Friedrichs committed
345
#define METHOD_NAME(a, b) a##Cutoff##b
346
347
348
#include "kCalculateAmoebaCudaPmeFixedEField.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
Mark Friedrichs's avatar
Mark Friedrichs committed
349
#define METHOD_NAME(a, b) a##CutoffByWarp##b
350
351
#include "kCalculateAmoebaCudaPmeFixedEField.h"

Mark Friedrichs's avatar
Mark Friedrichs committed
352
353
354
355
356
357
358
359
360
/**---------------------------------------------------------------------------------------

   Report whether a number is a nan or infinity

   @param number               number to test
   @return 1 if number is  nan or infinity; else return 0

   --------------------------------------------------------------------------------------- */

Mark Friedrichs's avatar
Mark Friedrichs committed
361
#ifdef AMOEBA_DEBUG
Mark Friedrichs's avatar
Mark Friedrichs committed
362
363
364
static int isNanOrInfinity( double number ){
    return (number != number || number == std::numeric_limits<double>::infinity() || number == -std::numeric_limits<double>::infinity()) ? 1 : 0; 
}
Mark Friedrichs's avatar
Mark Friedrichs committed
365
#endif
Mark Friedrichs's avatar
Mark Friedrichs committed
366

Mark Friedrichs's avatar
Mark Friedrichs committed
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
static void bubbleSort( std::vector<int>& array, std::vector<int>& track, int length)
{
  int i, j, temp;
  int test; /*use this only if unsure whether the list is already sorted or not*/
  for(i = length - 1; i > 0; i--)
  {
    test=0;
    for(j = 0; j < i; j++)
    {
      if(array[j] > array[j+1]) /* compare neighboring elements */
      {

        temp = array[j];    /* swap array[j] and array[j+1] */
        array[j] = array[j+1];
        array[j+1] = temp;

        temp = track[j];    /* swap array[j] and array[j+1] */
        track[j] = track[j+1];
        track[j+1] = temp;

        test=1;
      }
    } /*end for j*/
    if(test==0) break; /*will exit if the list is sorted!*/
  } /*end for i*/
      
}/*end bubbleSort*/

395
396
397
398
399
400
401
402
/**---------------------------------------------------------------------------------------

   Compute fixed electric field using PME

   @param amoebaGpu        amoebaGpu context

   --------------------------------------------------------------------------------------- */

Mark Friedrichs's avatar
Mark Friedrichs committed
403
static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
404
405
{
  
Mark Friedrichs's avatar
Mark Friedrichs committed
406
407
    static unsigned int threadsPerBlock  = 0;
    gpuContext gpu                       = amoebaGpu->gpuContext;
408
409
410
411
412
413
414

#ifdef AMOEBA_DEBUG
    static const char* methodName = "computeCudaAmoebaPmeFixedEField";
    if( amoebaGpu->log ){
        (void) fprintf( amoebaGpu->log, "\n%s\n", methodName ); (void) fflush( amoebaGpu->log );
    }
    int paddedNumberOfAtoms                    = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
Mark Friedrichs's avatar
Mark Friedrichs committed
415
416
417
    int slots                                  = 15;
    CUDAStream<float4>* debugArray             = new CUDAStream<float4>(paddedNumberOfAtoms*slots, 1, "DebugArray");
    memset( debugArray->_pSysData, 0, sizeof( float )*4*paddedNumberOfAtoms*slots);
418
419
420
421
    debugArray->Upload();

    // print intermediate results for the targetAtom 

Mark Friedrichs's avatar
Mark Friedrichs committed
422
    unsigned int targetAtom  = 0;
Mark Friedrichs's avatar
Mark Friedrichs committed
423
#endif
Mark Friedrichs's avatar
Mark Friedrichs committed
424

425
426
    kClearFields_3( amoebaGpu, 2 );

Mark Friedrichs's avatar
Mark Friedrichs committed
427
428
429
430
431
432
433
    // on first pass, set threads/block

    if( threadsPerBlock == 0 ){ 
        unsigned int maxThreads;
        if (gpu->sm_version >= SM_20)
            maxThreads = 384; 
        else if (gpu->sm_version >= SM_12)
434
            maxThreads = 192;
Mark Friedrichs's avatar
Mark Friedrichs committed
435
436
437
438
439
        else
            maxThreads = 64;
        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(FixedFieldParticle)), maxThreads);
    }    

440
    if (gpu->bOutputBufferPerWarp){
Mark Friedrichs's avatar
Mark Friedrichs committed
441
        kCalculateAmoebaPmeDirectFixedE_FieldCutoffByWarp_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
Mark Friedrichs's avatar
Mark Friedrichs committed
442
                                                                           gpu->sim.pInteractingWorkUnit,
Mark Friedrichs's avatar
Mark Friedrichs committed
443
                                                                           amoebaGpu->psWorkArray_3_1->_pDevData,
444
#ifdef AMOEBA_DEBUG
Mark Friedrichs's avatar
Mark Friedrichs committed
445
446
                                                                           amoebaGpu->psWorkArray_3_2->_pDevData,
                                                                           debugArray->_pDevData, targetAtom );
447
#else
Mark Friedrichs's avatar
Mark Friedrichs committed
448
                                                                           amoebaGpu->psWorkArray_3_2->_pDevData );
449
450
#endif
    } else {
Mark Friedrichs's avatar
Mark Friedrichs committed
451
        kCalculateAmoebaPmeDirectFixedE_FieldCutoff_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
Mark Friedrichs's avatar
Mark Friedrichs committed
452
                                                                           gpu->sim.pInteractingWorkUnit,
Mark Friedrichs's avatar
Mark Friedrichs committed
453
                                                                           amoebaGpu->psWorkArray_3_1->_pDevData,
454
#ifdef AMOEBA_DEBUG
Mark Friedrichs's avatar
Mark Friedrichs committed
455
456
                                                                           amoebaGpu->psWorkArray_3_2->_pDevData,
                                                                           debugArray->_pDevData, targetAtom );
457
#else
Mark Friedrichs's avatar
Mark Friedrichs committed
458
                                                                           amoebaGpu->psWorkArray_3_2->_pDevData );
459
460
461
462
463
464
465
466
467
#endif
    }
    LAUNCHERROR("kCalculateAmoebaPmeDirectFixedE_Field_kernel");

    kReducePmeDirectE_Fields( amoebaGpu );

#ifdef AMOEBA_DEBUG
    if( amoebaGpu->log ){
        gpu->psInteractionCount->Download();
Mark Friedrichs's avatar
Mark Friedrichs committed
468
469
470
        (void) fprintf( amoebaGpu->log, "cudaComputeAmoebaPmeDirectFixedEField:  threadsPerBlock=%u getThreadsPerBlock=%d sizeof=%u shrd=%u\n", 
                        threadsPerBlock, getThreadsPerBlock(amoebaGpu, sizeof(FixedFieldParticle)+sizeof(float3)),
                        (sizeof(FixedFieldParticle)+sizeof(float3)), (sizeof(FixedFieldParticle)+sizeof(float3))*threadsPerBlock );
471
        (void) fprintf( amoebaGpu->log, "AmoebaCutoffForces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u warp=%d\n",
Mark Friedrichs's avatar
Mark Friedrichs committed
472
                        amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
473
                        sizeof(FixedFieldParticle), sizeof(FixedFieldParticle)*threadsPerBlock,
Mark Friedrichs's avatar
Mark Friedrichs committed
474
                        (*gpu->psInteractionCount)[0], gpu->sim.workUnits, gpu->bOutputBufferPerWarp );
475
        (void) fflush( amoebaGpu->log );
Mark Friedrichs's avatar
Mark Friedrichs committed
476
477
/*
        (void) fprintf( amoebaGpu->log, "Out WorkArray_3_[1,2]  paddedNumberOfAtoms=%d\n",  amoebaGpu->paddedNumberOfAtoms, amoebaGpu->outputBuffers );
478
479
        amoebaGpu->psWorkArray_3_1->Download();
        amoebaGpu->psWorkArray_3_2->Download();
Mark Friedrichs's avatar
Mark Friedrichs committed
480
481
482
483
484
485
486
487
        for( int ii = 0; ii < amoebaGpu->paddedNumberOfAtoms; ii++ ){
           (void) fprintf( amoebaGpu->log, "%5d ", ii); 

            int indexOffset     = ii*3;

           // buffer 1

           (void) fprintf( amoebaGpu->log,"WArry1[%16.9e %16.9e %16.9e] ",
Mark Friedrichs's avatar
Mark Friedrichs committed
488
489
490
                           amoebaGpu->psWorkArray_3_1->_pSysData[indexOffset],
                           amoebaGpu->psWorkArray_3_1->_pSysData[indexOffset+1],
                           amoebaGpu->psWorkArray_3_1->_pSysData[indexOffset+2] );
Mark Friedrichs's avatar
Mark Friedrichs committed
491
492
493
494
   
           // buffer 2

           (void) fprintf( amoebaGpu->log,"WArry2[%16.9e %16.9e %16.9e] ",
Mark Friedrichs's avatar
Mark Friedrichs committed
495
496
497
                           amoebaGpu->psWorkArray_3_2->_pSysData[indexOffset],
                           amoebaGpu->psWorkArray_3_2->_pSysData[indexOffset+1],
                           amoebaGpu->psWorkArray_3_2->_pSysData[indexOffset+2] );
Mark Friedrichs's avatar
Mark Friedrichs committed
498
499
500
501
502
503
504

           (void) fprintf( amoebaGpu->log,"\n" );
           if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
                ii = gpu->natoms - maxPrint;
           }
        }
        (void) fflush( amoebaGpu->log );
Mark Friedrichs's avatar
Mark Friedrichs committed
505
506
507
*/
        amoebaGpu->psE_Field->Download();
        amoebaGpu->psE_FieldPolar->Download();
Mark Friedrichs's avatar
Mark Friedrichs committed
508
509
        (void) fprintf( amoebaGpu->log,"E-field (includes self term)" );
        int maxPrint             = 3002;
510
511
512
513
514
515
516
517
        for( int ii = 0; ii < gpu->natoms; ii++ ){
           (void) fprintf( amoebaGpu->log, "%5d ", ii); 

            int indexOffset     = ii*3;

           // E_Field

           (void) fprintf( amoebaGpu->log,"E[%16.9e %16.9e %16.9e] ",
Mark Friedrichs's avatar
Mark Friedrichs committed
518
519
520
                           amoebaGpu->psE_Field->_pSysData[indexOffset],
                           amoebaGpu->psE_Field->_pSysData[indexOffset+1],
                           amoebaGpu->psE_Field->_pSysData[indexOffset+2] );
521
522
523
524
   
           // E_Field polar

           (void) fprintf( amoebaGpu->log,"Epol[%16.9e %16.9e %16.9e] ",
Mark Friedrichs's avatar
Mark Friedrichs committed
525
526
527
                           amoebaGpu->psE_FieldPolar->_pSysData[indexOffset],
                           amoebaGpu->psE_FieldPolar->_pSysData[indexOffset+1],
                           amoebaGpu->psE_FieldPolar->_pSysData[indexOffset+2] );
528
529
530
531
532
533
534
535
536
537
538
539

           (void) fprintf( amoebaGpu->log,"\n" );
           if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
                ii = gpu->natoms - maxPrint;
           }
        }
        (void) fflush( amoebaGpu->log );
        (void) fprintf( amoebaGpu->log, "EFields End\n" );

        (void) fprintf( amoebaGpu->log, "DebugQ\n" );
        debugArray->Download();

Mark Friedrichs's avatar
Mark Friedrichs committed
540
541
542
543
544
545
546
547
548
549
550
551
        std::vector<int> indices;
        std::vector<int> track;
        for( int jj = 0; jj < gpu->natoms; jj++ ){
            int debugIndex = jj;
            if( fabs(debugArray->_pSysData[jj+3*paddedNumberOfAtoms].x) > 0.0 ){
                int orderIndex = gpu->psAtomIndex->_pSysData[jj];
                indices.push_back( orderIndex );
                track.push_back( jj );
            }
        }
        bubbleSort( indices, track, static_cast<int>(track.size()) );

552
        int paddedNumberOfAtoms                    = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
Mark Friedrichs's avatar
Mark Friedrichs committed
553
        amoebaGpu->gpuContext->psPosq4->Download();
Mark Friedrichs's avatar
Mark Friedrichs committed
554
555
556
557
558
559
        unsigned int count = 0;
        float sum0[3] = { 0.0f, 0.0f, 0.0f };
        float sum1[3] = { 0.0f, 0.0f, 0.0f };
        int offset0   = 1;
        int offset1   = 2;
/*
560
561
        for( int jj = 0; jj < gpu->natoms; jj++ ){
            int debugIndex = jj;
Mark Friedrichs's avatar
Mark Friedrichs committed
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
if( fabs(debugArray->_pSysData[jj+3*paddedNumberOfAtoms].x) > 0.0 ){
            int orderIndex = gpu->psAtomIndex->_pSysData[jj];
            count++;
*/

        for( unsigned int ii = 0; ii < track.size(); ii++ ){
            int jj         = track[ii];
            int debugIndex = jj;
            int orderIndex = indices[ii];
            if( orderIndex > 31 && offset0 == 1 ){
                offset0 = 2;
                offset1 = 2;
            }
            count++;

            sum0[0] += debugArray->_pSysData[jj+offset0*paddedNumberOfAtoms].x;
            sum0[1] += debugArray->_pSysData[jj+offset0*paddedNumberOfAtoms].y;
            sum0[2] += debugArray->_pSysData[jj+offset0*paddedNumberOfAtoms].z;

            sum1[0] += debugArray->_pSysData[jj+offset1*paddedNumberOfAtoms].x;
            sum1[1] += debugArray->_pSysData[jj+offset1*paddedNumberOfAtoms].y;
            sum1[2] += debugArray->_pSysData[jj+offset1*paddedNumberOfAtoms].z;

            (void) fprintf( amoebaGpu->log,"%5d %5d %u PmeFixedEField\n", orderIndex, jj, count );
Mark Friedrichs's avatar
Mark Friedrichs committed
586
            for( int kk = 0; kk < 7; kk++ ){
587
                (void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %16.9e]\n",
Mark Friedrichs's avatar
Mark Friedrichs committed
588
589
                                debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
                                debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w );
590
591
                debugIndex += paddedNumberOfAtoms;
            }
Mark Friedrichs's avatar
Mark Friedrichs committed
592
593
            (void) fprintf( amoebaGpu->log,"%6d %16.9e %16.9e %16.9e %16.9e %16.9e %16.9e %6d %6d cum sumsOp\n", 
                            orderIndex, sum0[0], sum0[1], sum0[2], sum1[0], sum1[1], sum1[2], jj, count );
594
595
596
597
598
599
600
601
602
603
            (void) fprintf( amoebaGpu->log,"\n" );

        }

        // write results to file

        if( 1 ){
            std::vector<int> fileId;
            //fileId.push_back( 0 );
            VectorOfDoubleVectors outputVector;
604
605
606
            //cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,              outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
            cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psE_Field,      outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
            cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psE_FieldPolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
607
608
609
610
611
612
613
            cudaWriteVectorOfDoubleVectorsToFile( "CudaEField", fileId, outputVector );
         }
         delete debugArray;
    }
#endif

}
Mark Friedrichs's avatar
Mark Friedrichs committed
614
615
616

void cudaComputeAmoebaPmeFixedEField( amoebaGpuContext amoebaGpu )
{
Mark Friedrichs's avatar
Mark Friedrichs committed
617

618
    kCalculateAmoebaPMEFixedMultipoles( amoebaGpu );
Mark Friedrichs's avatar
Mark Friedrichs committed
619
    cudaComputeAmoebaPmeDirectFixedEField( amoebaGpu );
Mark Friedrichs's avatar
Mark Friedrichs committed
620
621
622
623
624
625

    if( 0 ){
        gpuContext gpu                       = amoebaGpu->gpuContext;
        std::vector<int> fileId;
        fileId.push_back( 0 );
        VectorOfDoubleVectors outputVector;
626
627
628
        //cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,              outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psE_Field,      outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psE_FieldPolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
Mark Friedrichs's avatar
Mark Friedrichs committed
629
630
631
632
633
634
635
636
637
        cudaWriteVectorOfDoubleVectorsToFile( "CudaRecipEField", fileId, outputVector );
        //exit(0);
    }

    if( 0 ){
        gpuContext gpu                       = amoebaGpu->gpuContext;
        std::vector<int> fileId;
        fileId.push_back( 0 );
        VectorOfDoubleVectors outputVector;
638
639
640
        //cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,              outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psE_Field,      outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psE_FieldPolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
Mark Friedrichs's avatar
Mark Friedrichs committed
641
642
        cudaWriteVectorOfDoubleVectorsToFile( "CudaEField", fileId, outputVector );
    }
Mark Friedrichs's avatar
Mark Friedrichs committed
643
644
645
646
647
648
649
650


    if( 0 ){
        cudaComputeAmoebaPmeDirectFixedEField( amoebaGpu );
        gpuContext gpu                       = amoebaGpu->gpuContext;
        std::vector<int> fileId;
        //fileId.push_back( 0 );
        VectorOfDoubleVectors outputVector;
651
652
653
        //cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,              outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psE_Field,      outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psE_FieldPolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
Mark Friedrichs's avatar
Mark Friedrichs committed
654
655
        cudaWriteVectorOfDoubleVectorsToFile( "CudaDirectEField", fileId, outputVector );
    }
Mark Friedrichs's avatar
Mark Friedrichs committed
656
}