kCalculateAmoebaCudaPmeFixedEField.cu 26 KB
Newer Older
1
2
3
4
5

//-----------------------------------------------------------------------------------------

//-----------------------------------------------------------------------------------------

6
#include "cudaKernels.h"
7
8
9
#include "amoebaCudaKernels.h"
#include "kCalculateAmoebaCudaUtilities.h"

Mark Friedrichs's avatar
Mark Friedrichs committed
10
//#define AMOEBA_DEBUG
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37

static __constant__ cudaGmxSimulation cSim;
static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;

void SetCalculateAmoebaCudaPmeFixedEFieldSim(amoebaGpuContext amoebaGpu)
{
    cudaError_t status;
    gpuContext gpu = amoebaGpu->gpuContext;
    status         = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));    
    RTERROR(status, "SetCalculateAmoebaCudaPmeFixedEFieldSim: cudaMemcpyToSymbol: SetSim copy to cSim failed");
    status         = cudaMemcpyToSymbol(cAmoebaSim, &amoebaGpu->amoebaSim, sizeof(cudaAmoebaGmxSimulation));    
    RTERROR(status, "SetCalculateAmoebaCudaPmeFixedEFieldSim: cudaMemcpyToSymbol: SetSim copy to cAmoebaSim failed");
}

void GetCalculateAmoebaCudaPmeFixedEFieldSim(amoebaGpuContext amoebaGpu)
{
    cudaError_t status;
    gpuContext gpu = amoebaGpu->gpuContext;
    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));    
    RTERROR(status, "GetCalculateAmoebaCudaPmeFixedEFieldSim: cudaMemcpyFromSymbol: SetSim copy from cSim failed");
    status = cudaMemcpyFromSymbol(&amoebaGpu->amoebaSim, cAmoebaSim, sizeof(cudaAmoebaGmxSimulation));         
    RTERROR(status, "GetCalculateAmoebaCudaPmeFixedEFieldSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed");
}

__global__ 
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
38
#elif (__CUDA_ARCH__ >= 120)
39
40
41
42
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
Mark Friedrichs's avatar
Mark Friedrichs committed
43
static void kReducePmeEFieldPolar_kernel( unsigned int fieldComponents, unsigned int outputBuffers, float* EFieldReciprocal,  float* fieldIn, float* fieldOut )
44
45
46
47
48
{
    unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;

    // Reduce field

49
    const float term = (4.0f/3.0f)*(cSim.alphaEwald*cSim.alphaEwald*cSim.alphaEwald)/cAmoebaSim.sqrtPi;
Mark Friedrichs's avatar
Mark Friedrichs committed
50
    //const float term = 0.0f;
51
52
53
54
55
    while (pos < fieldComponents)
    {   

        // self-term included here

Mark Friedrichs's avatar
Mark Friedrichs committed
56
        float totalField = EFieldReciprocal[pos] + term*cAmoebaSim.pLabFrameDipole[pos];
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77

        float* pFt       = fieldIn + pos;
        unsigned int i   = outputBuffers;
        while (i >= 4)
        {   
            totalField += pFt[0] + pFt[fieldComponents] + pFt[2*fieldComponents] + pFt[3*fieldComponents];
            pFt        += fieldComponents*4;
            i          -= 4;
        }   

        if (i >= 2)
        {   
            totalField += pFt[0] + pFt[fieldComponents];
            pFt        += fieldComponents*2;
            i          -= 2;
        }   

        if (i > 0)
        {   
            totalField += pFt[0];
        }   
Mark Friedrichs's avatar
Mark Friedrichs committed
78

79
80
81
82
83
        fieldOut[pos]   = totalField;
        pos            += gridDim.x * blockDim.x;
    }   
}

Mark Friedrichs's avatar
Mark Friedrichs committed
84
85
86
__global__ 
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
87
#elif (__CUDA_ARCH__ >= 120)
Mark Friedrichs's avatar
Mark Friedrichs committed
88
89
90
91
92
93
94
95
96
97
98
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
static void kReducePmeEField_kernel( unsigned int fieldComponents, unsigned int outputBuffers,  float* fieldIn, float* fieldOut )
{
    unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;

    // Reduce field

    const float term = (4.0f/3.0f)*(cSim.alphaEwald*cSim.alphaEwald*cSim.alphaEwald)/cAmoebaSim.sqrtPi;
Mark Friedrichs's avatar
Mark Friedrichs committed
99
    //const float term = 0.0;
Mark Friedrichs's avatar
Mark Friedrichs committed
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
    while (pos < fieldComponents)
    {   

        // self-term included here

        float totalField = term*cAmoebaSim.pLabFrameDipole[pos];

        float* pFt       = fieldIn + pos;
        unsigned int i   = outputBuffers;
        while (i >= 4)
        {   
            totalField += pFt[0] + pFt[fieldComponents] + pFt[2*fieldComponents] + pFt[3*fieldComponents];
            pFt        += fieldComponents*4;
            i          -= 4;
        }   

        if (i >= 2)
        {   
            totalField += pFt[0] + pFt[fieldComponents];
            pFt        += fieldComponents*2;
            i          -= 2;
        }   

        if (i > 0)
        {   
            totalField += pFt[0];
        }   

        fieldOut[pos]  += totalField;
        pos            += gridDim.x * blockDim.x;
    }   
}

133
134
135
136
137
// reduce psWorkArray_3_1 -> EField
// reduce psWorkArray_3_2 -> EFieldPolar

static void kReducePmeDirectE_Fields(amoebaGpuContext amoebaGpu )
{
Mark Friedrichs's avatar
Mark Friedrichs committed
138

139
140
    gpuContext gpu = amoebaGpu->gpuContext;

Mark Friedrichs's avatar
Mark Friedrichs committed
141
142
    // E_FieldPolar = E_Field (reciprocal) + E_FieldPolar (direct) + self

143
144
    kReducePmeEFieldPolar_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
                                   gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
Mark Friedrichs's avatar
Mark Friedrichs committed
145
                                   amoebaGpu->psE_Field->_pDevData, amoebaGpu->psWorkArray_3_2->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData );
146
147
    LAUNCHERROR("kReducePmeE_Fields1");

Mark Friedrichs's avatar
Mark Friedrichs committed
148
149
    // E_Field = E_Field (reciprocal) + E_Field (direct) + self

150
151
    kReducePmeEField_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
                              gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
Mark Friedrichs's avatar
Mark Friedrichs committed
152
                              amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psE_Field->_pDevData );
153
154
155
156
157
158
    LAUNCHERROR("kReducePmeE_Fields2");
}

// file includes FixedFieldParticle struct definition/load/unload struct and body kernel for fixed E-field

#undef GK
Mark Friedrichs's avatar
Mark Friedrichs committed
159
160
#undef INCLUDE_FIXED_FIELD_BUFFERS
#define INCLUDE_FIXED_FIELD_BUFFERS
161
#include "kCalculateAmoebaCudaFixedFieldParticle.h"
Mark Friedrichs's avatar
Mark Friedrichs committed
162
163
164
165
166
167
168
169
170
171
172
#undef INCLUDE_FIXED_FIELD_BUFFERS
__device__ void sumTempBuffer( FixedFieldParticle& atomI, FixedFieldParticle& atomJ ){
    atomI.tempBuffer[0]  += atomJ.tempBuffer[0];
    atomI.tempBuffer[1]  += atomJ.tempBuffer[1];
    atomI.tempBuffer[2]  += atomJ.tempBuffer[2];

    atomI.tempBufferP[0] += atomJ.tempBufferP[0];
    atomI.tempBufferP[1] += atomJ.tempBufferP[1];
    atomI.tempBufferP[2] += atomJ.tempBufferP[2];
}

173
__device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle& atomI, FixedFieldParticle& atomJ,
174
                                                            float dscale, float pscale, float4 fields[3]
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
#ifdef AMOEBA_DEBUG
                                                            , float4* pullBack
#endif

 ){

    // compute the real space portion of the Ewald summation
  
    float xr          = atomJ.x - atomI.x;
    float yr          = atomJ.y - atomI.y;
    float zr          = atomJ.z - atomI.z;

    // periodic boundary conditions

    xr               -= floor(xr*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
    yr               -= floor(yr*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
    zr               -= floor(zr*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;

Mark Friedrichs's avatar
Mark Friedrichs committed
193
    float r2          = xr*xr + yr*yr + zr*zr;
Peter Eastman's avatar
Peter Eastman committed
194
    if( r2 <= cSim.nonbondedCutoffSqr ){
Mark Friedrichs's avatar
Mark Friedrichs committed
195

Peter Eastman's avatar
Peter Eastman committed
196
        float r           = sqrtf(r2);
197

Peter Eastman's avatar
Peter Eastman committed
198
        // calculate the error function damping terms
199

Peter Eastman's avatar
Peter Eastman committed
200
        float ralpha      = cSim.alphaEwald*r;
201

Mark Friedrichs's avatar
Mark Friedrichs committed
202
        float bn0         = erfc(ralpha)/r;
Peter Eastman's avatar
Peter Eastman committed
203
204
205
206
        float alsq2       = 2.0f*cSim.alphaEwald*cSim.alphaEwald;
        float alsq2n      = 1.0f/(cAmoebaSim.sqrtPi*cSim.alphaEwald);
        float exp2a       = exp(-(ralpha*ralpha));
        alsq2n           *= alsq2;
Mark Friedrichs's avatar
Mark Friedrichs committed
207
        float bn1         = (bn0+alsq2n*exp2a)/r2;
208

Peter Eastman's avatar
Peter Eastman committed
209
        alsq2n           *= alsq2;
Mark Friedrichs's avatar
Mark Friedrichs committed
210
        float bn2         = (3.0f*bn1+alsq2n*exp2a)/r2;
211

Peter Eastman's avatar
Peter Eastman committed
212
        alsq2n           *= alsq2;
Mark Friedrichs's avatar
Mark Friedrichs committed
213
        float bn3         = (5.0f*bn2+alsq2n*exp2a)/r2;
214

Peter Eastman's avatar
Peter Eastman committed
215
        // compute the error function scaled and unscaled terms
216

Peter Eastman's avatar
Peter Eastman committed
217
218
219
220
221
        float scale3      = 1.0f;
        float scale5      = 1.0f;
        float scale7      = 1.0f;
        float damp        = atomI.damp*atomJ.damp;
        if( damp != 0.0f ){
222

Peter Eastman's avatar
Peter Eastman committed
223
224
            float ratio  = (r/damp);
                  ratio  = ratio*ratio*ratio;
225

Peter Eastman's avatar
Peter Eastman committed
226
            float pgamma = atomI.thole < atomJ.thole ? atomI.thole : atomJ.thole;
227

Peter Eastman's avatar
Peter Eastman committed
228
                  damp   = -pgamma*ratio;
229

Peter Eastman's avatar
Peter Eastman committed
230
231
232
233
234
235
            if( damp > -50.0f) {
                float expdamp = exp(damp);
                scale3        = 1.0f - expdamp;
                scale5        = 1.0f - expdamp*(1.0f-damp);
                scale7        = 1.0f - expdamp*(1.0f-damp+(0.6f*damp*damp));
            }
236
        }
Peter Eastman's avatar
Peter Eastman committed
237
238
239
        float dsc3        = dscale*scale3;
        float dsc5        = dscale*scale5;
        float dsc7        = dscale*scale7;
240

Peter Eastman's avatar
Peter Eastman committed
241
242
243
        float psc3        = pscale*scale3;
        float psc5        = pscale*scale5;
        float psc7        = pscale*scale7;
244

Peter Eastman's avatar
Peter Eastman committed
245
246
247
248
249
250
        float r3          = (r*r2);
        float r5          = (r3*r2);
        float r7          = (r5*r2);
        float drr3        = (1.0f-dsc3)/r3;
        float drr5        = 3.0f * (1.0f-dsc5)/r5;
        float drr7        = 15.0f * (1.0f-dsc7)/r7;
251

Peter Eastman's avatar
Peter Eastman committed
252
253
254
        float prr3        = (1.0f-psc3) / r3;
        float prr5        = 3.0f *(1.0f-psc5)/r5;
        float prr7        = 15.0f*(1.0f-psc7)/r7;
255

Mark Friedrichs's avatar
Mark Friedrichs committed
256
        float dir         = atomI.labFrameDipole_X*xr      + atomI.labFrameDipole_Y*yr      + atomI.labFrameDipole_Z*zr;
257

Peter Eastman's avatar
Peter Eastman committed
258
259
260
        float qix         = atomI.labFrameQuadrupole_XX*xr + atomI.labFrameQuadrupole_XY*yr + atomI.labFrameQuadrupole_XZ*zr;
        float qiy         = atomI.labFrameQuadrupole_XY*xr + atomI.labFrameQuadrupole_YY*yr + atomI.labFrameQuadrupole_YZ*zr;
        float qiz         = atomI.labFrameQuadrupole_XZ*xr + atomI.labFrameQuadrupole_YZ*yr + atomI.labFrameQuadrupole_ZZ*zr;
261

Peter Eastman's avatar
Peter Eastman committed
262
        float qir         = qix*xr + qiy*yr + qiz*zr;
263

Mark Friedrichs's avatar
Mark Friedrichs committed
264
265
        float dkr         = atomJ.labFrameDipole_X*xr      + atomJ.labFrameDipole_Y*yr      + atomJ.labFrameDipole_Z*zr;

Peter Eastman's avatar
Peter Eastman committed
266
267
268
        float qkx         = atomJ.labFrameQuadrupole_XX*xr + atomJ.labFrameQuadrupole_XY*yr + atomJ.labFrameQuadrupole_XZ*zr;
        float qky         = atomJ.labFrameQuadrupole_XY*xr + atomJ.labFrameQuadrupole_YY*yr + atomJ.labFrameQuadrupole_YZ*zr;
        float qkz         = atomJ.labFrameQuadrupole_XZ*xr + atomJ.labFrameQuadrupole_YZ*yr + atomJ.labFrameQuadrupole_ZZ*zr;
269

Mark Friedrichs's avatar
Mark Friedrichs committed
270
        float qkr         = qkx*xr + qky*yr + qkz*zr;
271

Mark Friedrichs's avatar
Mark Friedrichs committed
272
273
274
        float fim0        = -xr*(bn1*atomJ.q-bn2*dkr+bn3*qkr)    - bn1*atomJ.labFrameDipole_X  + 2.0f*bn2*qkx;
        float fim1        = -yr*(bn1*atomJ.q-bn2*dkr+bn3*qkr)    - bn1*atomJ.labFrameDipole_Y  + 2.0f*bn2*qky;
        float fim2        = -zr*(bn1*atomJ.q-bn2*dkr+bn3*qkr)    - bn1*atomJ.labFrameDipole_Z  + 2.0f*bn2*qkz;
275

Mark Friedrichs's avatar
Mark Friedrichs committed
276
277
278
        float fkm0        = xr*(bn1*atomI.q+bn2*dir+bn3*qir)     - bn1*atomI.labFrameDipole_X  - 2.0f*bn2*qix;
        float fkm1        = yr*(bn1*atomI.q+bn2*dir+bn3*qir)     - bn1*atomI.labFrameDipole_Y  - 2.0f*bn2*qiy;
        float fkm2        = zr*(bn1*atomI.q+bn2*dir+bn3*qir)     - bn1*atomI.labFrameDipole_Z  - 2.0f*bn2*qiz;
279

Mark Friedrichs's avatar
Mark Friedrichs committed
280
281
282
        float fid0        = -xr*(drr3*atomJ.q-drr5*dkr+drr7*qkr) - drr3*atomJ.labFrameDipole_X + 2.0f*drr5*qkx;
        float fid1        = -yr*(drr3*atomJ.q-drr5*dkr+drr7*qkr) - drr3*atomJ.labFrameDipole_Y + 2.0f*drr5*qky;
        float fid2        = -zr*(drr3*atomJ.q-drr5*dkr+drr7*qkr) - drr3*atomJ.labFrameDipole_Z + 2.0f*drr5*qkz;
283

Mark Friedrichs's avatar
Mark Friedrichs committed
284
285
286
        float fkd0        = xr*(drr3*atomI.q+drr5*dir+drr7*qir)  - drr3*atomI.labFrameDipole_X - 2.0f*drr5*qix;
        float fkd1        = yr*(drr3*atomI.q+drr5*dir+drr7*qir)  - drr3*atomI.labFrameDipole_Y - 2.0f*drr5*qiy;
        float fkd2        = zr*(drr3*atomI.q+drr5*dir+drr7*qir)  - drr3*atomI.labFrameDipole_Z - 2.0f*drr5*qiz;
287

Mark Friedrichs's avatar
Mark Friedrichs committed
288
289
290
        float fip0        = -xr*(prr3*atomJ.q-prr5*dkr+prr7*qkr) - prr3*atomJ.labFrameDipole_X + 2.0f*prr5*qkx;
        float fip1        = -yr*(prr3*atomJ.q-prr5*dkr+prr7*qkr) - prr3*atomJ.labFrameDipole_Y + 2.0f*prr5*qky;
        float fip2        = -zr*(prr3*atomJ.q-prr5*dkr+prr7*qkr) - prr3*atomJ.labFrameDipole_Z + 2.0f*prr5*qkz;
291

Mark Friedrichs's avatar
Mark Friedrichs committed
292
293
294
        float fkp0        = xr*(prr3*atomI.q+prr5*dir+prr7*qir)  - prr3*atomI.labFrameDipole_X - 2.0f*prr5*qix;
        float fkp1        = yr*(prr3*atomI.q+prr5*dir+prr7*qir)  - prr3*atomI.labFrameDipole_Y - 2.0f*prr5*qiy;
        float fkp2        = zr*(prr3*atomI.q+prr5*dir+prr7*qir)  - prr3*atomI.labFrameDipole_Z - 2.0f*prr5*qiz;
295

Peter Eastman's avatar
Peter Eastman committed
296
        // increment the field at each site due to this interaction
297

298
299
300
        fields[0].x       = fim0 - fid0;
        fields[1].x       = fim1 - fid1;
        fields[2].x       = fim2 - fid2;
Mark Friedrichs's avatar
Mark Friedrichs committed
301

302
303
304
        fields[0].y       = fkm0 - fkd0;
        fields[1].y       = fkm1 - fkd1;
        fields[2].y       = fkm2 - fkd2;
Mark Friedrichs's avatar
Mark Friedrichs committed
305

306
307
308
        fields[0].z       = fim0 - fip0;
        fields[1].z       = fim1 - fip1;
        fields[2].z       = fim2 - fip2;
Mark Friedrichs's avatar
Mark Friedrichs committed
309

310
311
312
        fields[0].w       = fkm0 - fkp0;
        fields[1].w       = fkm1 - fkp1;
        fields[2].w       = fkm2 - fkp2;
313
314
315
 
    } else {

316
317
318
319
        fields[0].x       = 0.0f;
        fields[0].y       = 0.0f;
        fields[0].z       = 0.0f;
        fields[0].w       = 0.0f;
320
    
321
322
323
324
        fields[1].x       = 0.0f;
        fields[1].y       = 0.0f;
        fields[1].z       = 0.0f;
        fields[1].w       = 0.0f;
325
    
326
327
328
329
        fields[2].x       = 0.0f;
        fields[2].y       = 0.0f;
        fields[2].z       = 0.0f;
        fields[2].w       = 0.0f;
330
    }
Mark Friedrichs's avatar
Mark Friedrichs committed
331

332
333
334
335
336
337
338
339
340
341
#ifdef AMOEBA_DEBUG
    pullBack[0].x = xr;
    pullBack[0].y = yr;
    pullBack[0].z = zr;
    pullBack[0].w = r2;

    pullBack[1].x = atomJ.x - atomI.x;
    pullBack[1].y = atomJ.y - atomI.y;
    pullBack[1].z = atomJ.z - atomI.z;
    pullBack[1].w = (atomJ.x - atomI.x)*(atomJ.x - atomI.x) + (atomJ.y - atomI.y)*(atomJ.y - atomI.y)+ (atomJ.z - atomI.z)*(atomJ.z - atomI.z);
Mark Friedrichs's avatar
Mark Friedrichs committed
342

343
344
345
346
347
#endif
}

// Include versions of the kernels for N^2 calculations.

Mark Friedrichs's avatar
Mark Friedrichs committed
348
#define METHOD_NAME(a, b) a##Cutoff##b
349
350
351
#include "kCalculateAmoebaCudaPmeFixedEField.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
Mark Friedrichs's avatar
Mark Friedrichs committed
352
#define METHOD_NAME(a, b) a##CutoffByWarp##b
353
354
#include "kCalculateAmoebaCudaPmeFixedEField.h"

Mark Friedrichs's avatar
Mark Friedrichs committed
355
356
357
358
359
360
361
362
363
/**---------------------------------------------------------------------------------------

   Report whether a number is a nan or infinity

   @param number               number to test
   @return 1 if number is  nan or infinity; else return 0

   --------------------------------------------------------------------------------------- */

Mark Friedrichs's avatar
Mark Friedrichs committed
364
#ifdef AMOEBA_DEBUG
Mark Friedrichs's avatar
Mark Friedrichs committed
365
366
367
368
static int isNanOrInfinity( double number ){
    return (number != number || number == std::numeric_limits<double>::infinity() || number == -std::numeric_limits<double>::infinity()) ? 1 : 0; 
}

Mark Friedrichs's avatar
Mark Friedrichs committed
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
static void bubbleSort( std::vector<int>& array, std::vector<int>& track, int length)
{
  int i, j, temp;
  int test; /*use this only if unsure whether the list is already sorted or not*/
  for(i = length - 1; i > 0; i--)
  {
    test=0;
    for(j = 0; j < i; j++)
    {
      if(array[j] > array[j+1]) /* compare neighboring elements */
      {

        temp = array[j];    /* swap array[j] and array[j+1] */
        array[j] = array[j+1];
        array[j+1] = temp;

        temp = track[j];    /* swap array[j] and array[j+1] */
        track[j] = track[j+1];
        track[j+1] = temp;

        test=1;
      }
    } /*end for j*/
    if(test==0) break; /*will exit if the list is sorted!*/
  } /*end for i*/
      
395
396
}
#endif
Mark Friedrichs's avatar
Mark Friedrichs committed
397

398
399
400
401
402
403
404
405
/**---------------------------------------------------------------------------------------

   Compute fixed electric field using PME

   @param amoebaGpu        amoebaGpu context

   --------------------------------------------------------------------------------------- */

Mark Friedrichs's avatar
Mark Friedrichs committed
406
static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
407
408
{
  
Mark Friedrichs's avatar
Mark Friedrichs committed
409
410
    static unsigned int threadsPerBlock  = 0;
    gpuContext gpu                       = amoebaGpu->gpuContext;
411
412
413
414
415
416
417

#ifdef AMOEBA_DEBUG
    static const char* methodName = "computeCudaAmoebaPmeFixedEField";
    if( amoebaGpu->log ){
        (void) fprintf( amoebaGpu->log, "\n%s\n", methodName ); (void) fflush( amoebaGpu->log );
    }
    int paddedNumberOfAtoms                    = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
Mark Friedrichs's avatar
Mark Friedrichs committed
418
419
420
    int slots                                  = 15;
    CUDAStream<float4>* debugArray             = new CUDAStream<float4>(paddedNumberOfAtoms*slots, 1, "DebugArray");
    memset( debugArray->_pSysData, 0, sizeof( float )*4*paddedNumberOfAtoms*slots);
421
422
423
424
    debugArray->Upload();

    // print intermediate results for the targetAtom 

Mark Friedrichs's avatar
Mark Friedrichs committed
425
    unsigned int targetAtom  = 0;
Mark Friedrichs's avatar
Mark Friedrichs committed
426
#endif
Mark Friedrichs's avatar
Mark Friedrichs committed
427

428
429
    kClearFields_3( amoebaGpu, 2 );

Mark Friedrichs's avatar
Mark Friedrichs committed
430
431
432
433
434
435
436
    // on first pass, set threads/block

    if( threadsPerBlock == 0 ){ 
        unsigned int maxThreads;
        if (gpu->sm_version >= SM_20)
            maxThreads = 384; 
        else if (gpu->sm_version >= SM_12)
437
            maxThreads = 192;
Mark Friedrichs's avatar
Mark Friedrichs committed
438
439
440
441
442
        else
            maxThreads = 64;
        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(FixedFieldParticle)), maxThreads);
    }    

443
    if (gpu->bOutputBufferPerWarp){
444
        kCalculateAmoebaPmeDirectFixedE_FieldCutoffByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
Mark Friedrichs's avatar
Mark Friedrichs committed
445
                                                                           gpu->sim.pInteractingWorkUnit,
Mark Friedrichs's avatar
Mark Friedrichs committed
446
                                                                           amoebaGpu->psWorkArray_3_1->_pDevData,
447
#ifdef AMOEBA_DEBUG
Mark Friedrichs's avatar
Mark Friedrichs committed
448
449
                                                                           amoebaGpu->psWorkArray_3_2->_pDevData,
                                                                           debugArray->_pDevData, targetAtom );
450
#else
Mark Friedrichs's avatar
Mark Friedrichs committed
451
                                                                           amoebaGpu->psWorkArray_3_2->_pDevData );
452
453
#endif
    } else {
454
        kCalculateAmoebaPmeDirectFixedE_FieldCutoff_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
Mark Friedrichs's avatar
Mark Friedrichs committed
455
                                                                           gpu->sim.pInteractingWorkUnit,
Mark Friedrichs's avatar
Mark Friedrichs committed
456
                                                                           amoebaGpu->psWorkArray_3_1->_pDevData,
457
#ifdef AMOEBA_DEBUG
Mark Friedrichs's avatar
Mark Friedrichs committed
458
459
                                                                           amoebaGpu->psWorkArray_3_2->_pDevData,
                                                                           debugArray->_pDevData, targetAtom );
460
#else
Mark Friedrichs's avatar
Mark Friedrichs committed
461
                                                                           amoebaGpu->psWorkArray_3_2->_pDevData );
462
463
464
465
466
467
468
469
470
#endif
    }
    LAUNCHERROR("kCalculateAmoebaPmeDirectFixedE_Field_kernel");

    kReducePmeDirectE_Fields( amoebaGpu );

#ifdef AMOEBA_DEBUG
    if( amoebaGpu->log ){
        gpu->psInteractionCount->Download();
Mark Friedrichs's avatar
Mark Friedrichs committed
471
472
473
        (void) fprintf( amoebaGpu->log, "cudaComputeAmoebaPmeDirectFixedEField:  threadsPerBlock=%u getThreadsPerBlock=%d sizeof=%u shrd=%u\n", 
                        threadsPerBlock, getThreadsPerBlock(amoebaGpu, sizeof(FixedFieldParticle)+sizeof(float3)),
                        (sizeof(FixedFieldParticle)+sizeof(float3)), (sizeof(FixedFieldParticle)+sizeof(float3))*threadsPerBlock );
474
        (void) fprintf( amoebaGpu->log, "AmoebaCutoffForces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u warp=%d\n",
475
                        gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
476
                        sizeof(FixedFieldParticle), sizeof(FixedFieldParticle)*threadsPerBlock,
Mark Friedrichs's avatar
Mark Friedrichs committed
477
                        (*gpu->psInteractionCount)[0], gpu->sim.workUnits, gpu->bOutputBufferPerWarp );
478
        (void) fflush( amoebaGpu->log );
Mark Friedrichs's avatar
Mark Friedrichs committed
479
/*
480
        (void) fprintf( amoebaGpu->log, "Out WorkArray_3_[1,2]  paddedNumberOfAtoms=%d\n",  gpu->sim.paddedNumberOfAtoms, gpu->sim.outputBuffers );
481
482
        amoebaGpu->psWorkArray_3_1->Download();
        amoebaGpu->psWorkArray_3_2->Download();
483
        for( int ii = 0; ii < gpu->sim.paddedNumberOfAtoms; ii++ ){
Mark Friedrichs's avatar
Mark Friedrichs committed
484
485
486
487
488
489
490
           (void) fprintf( amoebaGpu->log, "%5d ", ii); 

            int indexOffset     = ii*3;

           // buffer 1

           (void) fprintf( amoebaGpu->log,"WArry1[%16.9e %16.9e %16.9e] ",
Mark Friedrichs's avatar
Mark Friedrichs committed
491
492
493
                           amoebaGpu->psWorkArray_3_1->_pSysData[indexOffset],
                           amoebaGpu->psWorkArray_3_1->_pSysData[indexOffset+1],
                           amoebaGpu->psWorkArray_3_1->_pSysData[indexOffset+2] );
Mark Friedrichs's avatar
Mark Friedrichs committed
494
495
496
497
   
           // buffer 2

           (void) fprintf( amoebaGpu->log,"WArry2[%16.9e %16.9e %16.9e] ",
Mark Friedrichs's avatar
Mark Friedrichs committed
498
499
500
                           amoebaGpu->psWorkArray_3_2->_pSysData[indexOffset],
                           amoebaGpu->psWorkArray_3_2->_pSysData[indexOffset+1],
                           amoebaGpu->psWorkArray_3_2->_pSysData[indexOffset+2] );
Mark Friedrichs's avatar
Mark Friedrichs committed
501
502
503
504
505
506
507

           (void) fprintf( amoebaGpu->log,"\n" );
           if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
                ii = gpu->natoms - maxPrint;
           }
        }
        (void) fflush( amoebaGpu->log );
Mark Friedrichs's avatar
Mark Friedrichs committed
508
509
510
*/
        amoebaGpu->psE_Field->Download();
        amoebaGpu->psE_FieldPolar->Download();
Mark Friedrichs's avatar
Mark Friedrichs committed
511
512
        (void) fprintf( amoebaGpu->log,"E-field (includes self term)" );
        int maxPrint             = 3002;
513
514
515
516
517
518
519
520
        for( int ii = 0; ii < gpu->natoms; ii++ ){
           (void) fprintf( amoebaGpu->log, "%5d ", ii); 

            int indexOffset     = ii*3;

           // E_Field

           (void) fprintf( amoebaGpu->log,"E[%16.9e %16.9e %16.9e] ",
Mark Friedrichs's avatar
Mark Friedrichs committed
521
522
523
                           amoebaGpu->psE_Field->_pSysData[indexOffset],
                           amoebaGpu->psE_Field->_pSysData[indexOffset+1],
                           amoebaGpu->psE_Field->_pSysData[indexOffset+2] );
524
525
526
527
   
           // E_Field polar

           (void) fprintf( amoebaGpu->log,"Epol[%16.9e %16.9e %16.9e] ",
Mark Friedrichs's avatar
Mark Friedrichs committed
528
529
530
                           amoebaGpu->psE_FieldPolar->_pSysData[indexOffset],
                           amoebaGpu->psE_FieldPolar->_pSysData[indexOffset+1],
                           amoebaGpu->psE_FieldPolar->_pSysData[indexOffset+2] );
531
532
533
534
535
536
537
538
539
540
541
542

           (void) fprintf( amoebaGpu->log,"\n" );
           if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
                ii = gpu->natoms - maxPrint;
           }
        }
        (void) fflush( amoebaGpu->log );
        (void) fprintf( amoebaGpu->log, "EFields End\n" );

        (void) fprintf( amoebaGpu->log, "DebugQ\n" );
        debugArray->Download();

Mark Friedrichs's avatar
Mark Friedrichs committed
543
544
545
546
547
548
549
550
551
552
553
554
        std::vector<int> indices;
        std::vector<int> track;
        for( int jj = 0; jj < gpu->natoms; jj++ ){
            int debugIndex = jj;
            if( fabs(debugArray->_pSysData[jj+3*paddedNumberOfAtoms].x) > 0.0 ){
                int orderIndex = gpu->psAtomIndex->_pSysData[jj];
                indices.push_back( orderIndex );
                track.push_back( jj );
            }
        }
        bubbleSort( indices, track, static_cast<int>(track.size()) );

555
        int paddedNumberOfAtoms                    = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
Mark Friedrichs's avatar
Mark Friedrichs committed
556
        amoebaGpu->gpuContext->psPosq4->Download();
Mark Friedrichs's avatar
Mark Friedrichs committed
557
558
559
560
561
562
        unsigned int count = 0;
        float sum0[3] = { 0.0f, 0.0f, 0.0f };
        float sum1[3] = { 0.0f, 0.0f, 0.0f };
        int offset0   = 1;
        int offset1   = 2;
/*
563
564
        for( int jj = 0; jj < gpu->natoms; jj++ ){
            int debugIndex = jj;
Mark Friedrichs's avatar
Mark Friedrichs committed
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
if( fabs(debugArray->_pSysData[jj+3*paddedNumberOfAtoms].x) > 0.0 ){
            int orderIndex = gpu->psAtomIndex->_pSysData[jj];
            count++;
*/

        for( unsigned int ii = 0; ii < track.size(); ii++ ){
            int jj         = track[ii];
            int debugIndex = jj;
            int orderIndex = indices[ii];
            if( orderIndex > 31 && offset0 == 1 ){
                offset0 = 2;
                offset1 = 2;
            }
            count++;

            sum0[0] += debugArray->_pSysData[jj+offset0*paddedNumberOfAtoms].x;
            sum0[1] += debugArray->_pSysData[jj+offset0*paddedNumberOfAtoms].y;
            sum0[2] += debugArray->_pSysData[jj+offset0*paddedNumberOfAtoms].z;

            sum1[0] += debugArray->_pSysData[jj+offset1*paddedNumberOfAtoms].x;
            sum1[1] += debugArray->_pSysData[jj+offset1*paddedNumberOfAtoms].y;
            sum1[2] += debugArray->_pSysData[jj+offset1*paddedNumberOfAtoms].z;

            (void) fprintf( amoebaGpu->log,"%5d %5d %u PmeFixedEField\n", orderIndex, jj, count );
Mark Friedrichs's avatar
Mark Friedrichs committed
589
            for( int kk = 0; kk < 7; kk++ ){
590
                (void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %16.9e]\n",
Mark Friedrichs's avatar
Mark Friedrichs committed
591
592
                                debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
                                debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w );
593
594
                debugIndex += paddedNumberOfAtoms;
            }
Mark Friedrichs's avatar
Mark Friedrichs committed
595
596
            (void) fprintf( amoebaGpu->log,"%6d %16.9e %16.9e %16.9e %16.9e %16.9e %16.9e %6d %6d cum sumsOp\n", 
                            orderIndex, sum0[0], sum0[1], sum0[2], sum1[0], sum1[1], sum1[2], jj, count );
597
598
599
600
601
602
603
604
605
606
            (void) fprintf( amoebaGpu->log,"\n" );

        }

        // write results to file

        if( 1 ){
            std::vector<int> fileId;
            //fileId.push_back( 0 );
            VectorOfDoubleVectors outputVector;
607
608
609
            //cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,              outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
            cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psE_Field,      outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
            cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psE_FieldPolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
610
611
612
613
614
615
616
            cudaWriteVectorOfDoubleVectorsToFile( "CudaEField", fileId, outputVector );
         }
         delete debugArray;
    }
#endif

}
Mark Friedrichs's avatar
Mark Friedrichs committed
617
618
619

void cudaComputeAmoebaPmeFixedEField( amoebaGpuContext amoebaGpu )
{
Mark Friedrichs's avatar
Mark Friedrichs committed
620

621
    kCalculateAmoebaPMEFixedMultipoles( amoebaGpu );
Mark Friedrichs's avatar
Mark Friedrichs committed
622
    cudaComputeAmoebaPmeDirectFixedEField( amoebaGpu );
Mark Friedrichs's avatar
Mark Friedrichs committed
623
624
625
626
627
628

    if( 0 ){
        gpuContext gpu                       = amoebaGpu->gpuContext;
        std::vector<int> fileId;
        fileId.push_back( 0 );
        VectorOfDoubleVectors outputVector;
629
        //cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4,              outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
630
631
        kReduceForces( gpu );
        cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psForce4,              outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
632
633
        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psE_Field,      outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
        cudaLoadCudaFloatArray( gpu->natoms,  3, amoebaGpu->psE_FieldPolar, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
634
635
636
        cudaWriteVectorOfDoubleVectorsToFile( "CudaRecipForceTorqueFixed", fileId, outputVector );
        //cudaWriteVectorOfDoubleVectorsToFile( "CudaRecipEField", fileId, outputVector );
        exit(0);
Mark Friedrichs's avatar
Mark Friedrichs committed
637
    }
Mark Friedrichs's avatar
Mark Friedrichs committed
638

Mark Friedrichs's avatar
Mark Friedrichs committed
639
}