kCalculateAmoebaCudaPmeFixedEField.cu 17.4 KB
Newer Older
1

Mark Friedrichs's avatar
Mark Friedrichs committed
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
/* -------------------------------------------------------------------------- *
 *                                   OpenMM                                   *
 * -------------------------------------------------------------------------- *
 * This is part of the OpenMM molecular simulation toolkit originating from   *
 * Simbios, the NIH National Center for Physics-Based Simulation of           *
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
 * Portions copyright (c) 2009 Stanford University and the Authors.           *
 * Authors: Scott Le Grand, Peter Eastman                                     *
 * Contributors:                                                              *
 *                                                                            *
 * This program is free software: you can redistribute it and/or modify       *
 * it under the terms of the GNU Lesser General Public License as published   *
 * by the Free Software Foundation, either version 3 of the License, or       *
 * (at your option) any later version.                                        *
 *                                                                            *
 * This program is distributed in the hope that it will be useful,            *
 * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
 * GNU Lesser General Public License for more details.                        *
 *                                                                            *
 * You should have received a copy of the GNU Lesser General Public License   *
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
 * -------------------------------------------------------------------------- */
27

28
#include "cudaKernels.h"
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
#include "amoebaCudaKernels.h"
#include "kCalculateAmoebaCudaUtilities.h"

static __constant__ cudaGmxSimulation cSim;
static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;

void SetCalculateAmoebaCudaPmeFixedEFieldSim(amoebaGpuContext amoebaGpu)
{
    cudaError_t status;
    gpuContext gpu = amoebaGpu->gpuContext;
    status         = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));    
    RTERROR(status, "SetCalculateAmoebaCudaPmeFixedEFieldSim: cudaMemcpyToSymbol: SetSim copy to cSim failed");
    status         = cudaMemcpyToSymbol(cAmoebaSim, &amoebaGpu->amoebaSim, sizeof(cudaAmoebaGmxSimulation));    
    RTERROR(status, "SetCalculateAmoebaCudaPmeFixedEFieldSim: cudaMemcpyToSymbol: SetSim copy to cAmoebaSim failed");
}

void GetCalculateAmoebaCudaPmeFixedEFieldSim(amoebaGpuContext amoebaGpu)
{
    cudaError_t status;
    gpuContext gpu = amoebaGpu->gpuContext;
    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));    
    RTERROR(status, "GetCalculateAmoebaCudaPmeFixedEFieldSim: cudaMemcpyFromSymbol: SetSim copy from cSim failed");
    status = cudaMemcpyFromSymbol(&amoebaGpu->amoebaSim, cAmoebaSim, sizeof(cudaAmoebaGmxSimulation));         
    RTERROR(status, "GetCalculateAmoebaCudaPmeFixedEFieldSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed");
}

__global__ 
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
58
#elif (__CUDA_ARCH__ >= 120)
59
60
61
62
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
Mark Friedrichs's avatar
Mark Friedrichs committed
63
static void kReducePmeEFieldPolar_kernel( unsigned int fieldComponents, unsigned int outputBuffers, float* EFieldReciprocal,  float* fieldIn, float* fieldOut )
64
65
66
67
68
{
    unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;

    // Reduce field

69
    const float term = (4.0f/3.0f)*(cSim.alphaEwald*cSim.alphaEwald*cSim.alphaEwald)/cAmoebaSim.sqrtPi;
Mark Friedrichs's avatar
Mark Friedrichs committed
70
    //const float term = 0.0f;
71
72
73
74
75
    while (pos < fieldComponents)
    {   

        // self-term included here

Mark Friedrichs's avatar
Mark Friedrichs committed
76
        float totalField = EFieldReciprocal[pos] + term*cAmoebaSim.pLabFrameDipole[pos];
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97

        float* pFt       = fieldIn + pos;
        unsigned int i   = outputBuffers;
        while (i >= 4)
        {   
            totalField += pFt[0] + pFt[fieldComponents] + pFt[2*fieldComponents] + pFt[3*fieldComponents];
            pFt        += fieldComponents*4;
            i          -= 4;
        }   

        if (i >= 2)
        {   
            totalField += pFt[0] + pFt[fieldComponents];
            pFt        += fieldComponents*2;
            i          -= 2;
        }   

        if (i > 0)
        {   
            totalField += pFt[0];
        }   
Mark Friedrichs's avatar
Mark Friedrichs committed
98

99
100
101
102
103
        fieldOut[pos]   = totalField;
        pos            += gridDim.x * blockDim.x;
    }   
}

Mark Friedrichs's avatar
Mark Friedrichs committed
104
105
106
__global__ 
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
107
#elif (__CUDA_ARCH__ >= 120)
Mark Friedrichs's avatar
Mark Friedrichs committed
108
109
110
111
112
113
114
115
116
117
118
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
static void kReducePmeEField_kernel( unsigned int fieldComponents, unsigned int outputBuffers,  float* fieldIn, float* fieldOut )
{
    unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;

    // Reduce field

    const float term = (4.0f/3.0f)*(cSim.alphaEwald*cSim.alphaEwald*cSim.alphaEwald)/cAmoebaSim.sqrtPi;
Mark Friedrichs's avatar
Mark Friedrichs committed
119
    //const float term = 0.0;
Mark Friedrichs's avatar
Mark Friedrichs committed
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
    while (pos < fieldComponents)
    {   

        // self-term included here

        float totalField = term*cAmoebaSim.pLabFrameDipole[pos];

        float* pFt       = fieldIn + pos;
        unsigned int i   = outputBuffers;
        while (i >= 4)
        {   
            totalField += pFt[0] + pFt[fieldComponents] + pFt[2*fieldComponents] + pFt[3*fieldComponents];
            pFt        += fieldComponents*4;
            i          -= 4;
        }   

        if (i >= 2)
        {   
            totalField += pFt[0] + pFt[fieldComponents];
            pFt        += fieldComponents*2;
            i          -= 2;
        }   

        if (i > 0)
        {   
            totalField += pFt[0];
        }   

        fieldOut[pos]  += totalField;
        pos            += gridDim.x * blockDim.x;
    }   
}

153
154
155
156
157
// reduce psWorkArray_3_1 -> EField
// reduce psWorkArray_3_2 -> EFieldPolar

static void kReducePmeDirectE_Fields(amoebaGpuContext amoebaGpu )
{
Mark Friedrichs's avatar
Mark Friedrichs committed
158

159
160
    gpuContext gpu = amoebaGpu->gpuContext;

Mark Friedrichs's avatar
Mark Friedrichs committed
161
162
    // E_FieldPolar = E_Field (reciprocal) + E_FieldPolar (direct) + self

163
164
    kReducePmeEFieldPolar_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
                                   gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
Mark Friedrichs's avatar
Mark Friedrichs committed
165
                                   amoebaGpu->psE_Field->_pDevData, amoebaGpu->psWorkArray_3_2->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData );
166
167
    LAUNCHERROR("kReducePmeE_Fields1");

Mark Friedrichs's avatar
Mark Friedrichs committed
168
169
    // E_Field = E_Field (reciprocal) + E_Field (direct) + self

170
171
    kReducePmeEField_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
                              gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
Mark Friedrichs's avatar
Mark Friedrichs committed
172
                              amoebaGpu->psWorkArray_3_1->_pDevData, amoebaGpu->psE_Field->_pDevData );
173
174
175
176
177
178
    LAUNCHERROR("kReducePmeE_Fields2");
}

// file includes FixedFieldParticle struct definition/load/unload struct and body kernel for fixed E-field

#undef GK
Mark Friedrichs's avatar
Mark Friedrichs committed
179
180
#undef INCLUDE_FIXED_FIELD_BUFFERS
#define INCLUDE_FIXED_FIELD_BUFFERS
181
#include "kCalculateAmoebaCudaFixedFieldParticle.h"
Mark Friedrichs's avatar
Mark Friedrichs committed
182
183
184
185
186
187
188
189
190
191
192
#undef INCLUDE_FIXED_FIELD_BUFFERS
__device__ void sumTempBuffer( FixedFieldParticle& atomI, FixedFieldParticle& atomJ ){
    atomI.tempBuffer[0]  += atomJ.tempBuffer[0];
    atomI.tempBuffer[1]  += atomJ.tempBuffer[1];
    atomI.tempBuffer[2]  += atomJ.tempBuffer[2];

    atomI.tempBufferP[0] += atomJ.tempBufferP[0];
    atomI.tempBufferP[1] += atomJ.tempBufferP[1];
    atomI.tempBufferP[2] += atomJ.tempBufferP[2];
}

193
__device__ void calculateFixedFieldRealSpacePairIxn_kernel( FixedFieldParticle& atomI, FixedFieldParticle& atomJ,
Mark Friedrichs's avatar
Mark Friedrichs committed
194
                                                            float dscale, float pscale, float4 fields[3]){
195
196
197
198
199
200
201
202
203
204
205
206
207

    // compute the real space portion of the Ewald summation
  
    float xr          = atomJ.x - atomI.x;
    float yr          = atomJ.y - atomI.y;
    float zr          = atomJ.z - atomI.z;

    // periodic boundary conditions

    xr               -= floor(xr*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
    yr               -= floor(yr*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
    zr               -= floor(zr*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;

Mark Friedrichs's avatar
Mark Friedrichs committed
208
    float r2          = xr*xr + yr*yr + zr*zr;
Peter Eastman's avatar
Peter Eastman committed
209
    if( r2 <= cSim.nonbondedCutoffSqr ){
Mark Friedrichs's avatar
Mark Friedrichs committed
210

Peter Eastman's avatar
Peter Eastman committed
211
        float r           = sqrtf(r2);
212

Peter Eastman's avatar
Peter Eastman committed
213
        // calculate the error function damping terms
214

Peter Eastman's avatar
Peter Eastman committed
215
        float ralpha      = cSim.alphaEwald*r;
216

Mark Friedrichs's avatar
Mark Friedrichs committed
217
        float bn0         = erfc(ralpha)/r;
Peter Eastman's avatar
Peter Eastman committed
218
219
220
221
        float alsq2       = 2.0f*cSim.alphaEwald*cSim.alphaEwald;
        float alsq2n      = 1.0f/(cAmoebaSim.sqrtPi*cSim.alphaEwald);
        float exp2a       = exp(-(ralpha*ralpha));
        alsq2n           *= alsq2;
Mark Friedrichs's avatar
Mark Friedrichs committed
222
        float bn1         = (bn0+alsq2n*exp2a)/r2;
223

Peter Eastman's avatar
Peter Eastman committed
224
        alsq2n           *= alsq2;
Mark Friedrichs's avatar
Mark Friedrichs committed
225
        float bn2         = (3.0f*bn1+alsq2n*exp2a)/r2;
226

Peter Eastman's avatar
Peter Eastman committed
227
        alsq2n           *= alsq2;
Mark Friedrichs's avatar
Mark Friedrichs committed
228
        float bn3         = (5.0f*bn2+alsq2n*exp2a)/r2;
229

Peter Eastman's avatar
Peter Eastman committed
230
        // compute the error function scaled and unscaled terms
231

Peter Eastman's avatar
Peter Eastman committed
232
233
234
235
236
        float scale3      = 1.0f;
        float scale5      = 1.0f;
        float scale7      = 1.0f;
        float damp        = atomI.damp*atomJ.damp;
        if( damp != 0.0f ){
237

Peter Eastman's avatar
Peter Eastman committed
238
239
            float ratio  = (r/damp);
                  ratio  = ratio*ratio*ratio;
240

Peter Eastman's avatar
Peter Eastman committed
241
            float pgamma = atomI.thole < atomJ.thole ? atomI.thole : atomJ.thole;
242

Peter Eastman's avatar
Peter Eastman committed
243
                  damp   = -pgamma*ratio;
244

Peter Eastman's avatar
Peter Eastman committed
245
246
247
248
249
250
            if( damp > -50.0f) {
                float expdamp = exp(damp);
                scale3        = 1.0f - expdamp;
                scale5        = 1.0f - expdamp*(1.0f-damp);
                scale7        = 1.0f - expdamp*(1.0f-damp+(0.6f*damp*damp));
            }
251
        }
Peter Eastman's avatar
Peter Eastman committed
252
253
254
        float dsc3        = dscale*scale3;
        float dsc5        = dscale*scale5;
        float dsc7        = dscale*scale7;
255

Peter Eastman's avatar
Peter Eastman committed
256
257
258
        float psc3        = pscale*scale3;
        float psc5        = pscale*scale5;
        float psc7        = pscale*scale7;
259

Peter Eastman's avatar
Peter Eastman committed
260
261
262
263
264
265
        float r3          = (r*r2);
        float r5          = (r3*r2);
        float r7          = (r5*r2);
        float drr3        = (1.0f-dsc3)/r3;
        float drr5        = 3.0f * (1.0f-dsc5)/r5;
        float drr7        = 15.0f * (1.0f-dsc7)/r7;
266

Peter Eastman's avatar
Peter Eastman committed
267
268
269
        float prr3        = (1.0f-psc3) / r3;
        float prr5        = 3.0f *(1.0f-psc5)/r5;
        float prr7        = 15.0f*(1.0f-psc7)/r7;
270

Mark Friedrichs's avatar
Mark Friedrichs committed
271
        float dir         = atomI.labFrameDipole_X*xr      + atomI.labFrameDipole_Y*yr      + atomI.labFrameDipole_Z*zr;
272

Peter Eastman's avatar
Peter Eastman committed
273
274
275
        float qix         = atomI.labFrameQuadrupole_XX*xr + atomI.labFrameQuadrupole_XY*yr + atomI.labFrameQuadrupole_XZ*zr;
        float qiy         = atomI.labFrameQuadrupole_XY*xr + atomI.labFrameQuadrupole_YY*yr + atomI.labFrameQuadrupole_YZ*zr;
        float qiz         = atomI.labFrameQuadrupole_XZ*xr + atomI.labFrameQuadrupole_YZ*yr + atomI.labFrameQuadrupole_ZZ*zr;
276

Peter Eastman's avatar
Peter Eastman committed
277
        float qir         = qix*xr + qiy*yr + qiz*zr;
278

Mark Friedrichs's avatar
Mark Friedrichs committed
279
280
        float dkr         = atomJ.labFrameDipole_X*xr      + atomJ.labFrameDipole_Y*yr      + atomJ.labFrameDipole_Z*zr;

Peter Eastman's avatar
Peter Eastman committed
281
282
283
        float qkx         = atomJ.labFrameQuadrupole_XX*xr + atomJ.labFrameQuadrupole_XY*yr + atomJ.labFrameQuadrupole_XZ*zr;
        float qky         = atomJ.labFrameQuadrupole_XY*xr + atomJ.labFrameQuadrupole_YY*yr + atomJ.labFrameQuadrupole_YZ*zr;
        float qkz         = atomJ.labFrameQuadrupole_XZ*xr + atomJ.labFrameQuadrupole_YZ*yr + atomJ.labFrameQuadrupole_ZZ*zr;
284

Mark Friedrichs's avatar
Mark Friedrichs committed
285
        float qkr         = qkx*xr + qky*yr + qkz*zr;
286

Mark Friedrichs's avatar
Mark Friedrichs committed
287
288
289
        float fim0        = -xr*(bn1*atomJ.q-bn2*dkr+bn3*qkr)    - bn1*atomJ.labFrameDipole_X  + 2.0f*bn2*qkx;
        float fim1        = -yr*(bn1*atomJ.q-bn2*dkr+bn3*qkr)    - bn1*atomJ.labFrameDipole_Y  + 2.0f*bn2*qky;
        float fim2        = -zr*(bn1*atomJ.q-bn2*dkr+bn3*qkr)    - bn1*atomJ.labFrameDipole_Z  + 2.0f*bn2*qkz;
290

Mark Friedrichs's avatar
Mark Friedrichs committed
291
292
293
        float fkm0        = xr*(bn1*atomI.q+bn2*dir+bn3*qir)     - bn1*atomI.labFrameDipole_X  - 2.0f*bn2*qix;
        float fkm1        = yr*(bn1*atomI.q+bn2*dir+bn3*qir)     - bn1*atomI.labFrameDipole_Y  - 2.0f*bn2*qiy;
        float fkm2        = zr*(bn1*atomI.q+bn2*dir+bn3*qir)     - bn1*atomI.labFrameDipole_Z  - 2.0f*bn2*qiz;
294

Mark Friedrichs's avatar
Mark Friedrichs committed
295
296
297
        float fid0        = -xr*(drr3*atomJ.q-drr5*dkr+drr7*qkr) - drr3*atomJ.labFrameDipole_X + 2.0f*drr5*qkx;
        float fid1        = -yr*(drr3*atomJ.q-drr5*dkr+drr7*qkr) - drr3*atomJ.labFrameDipole_Y + 2.0f*drr5*qky;
        float fid2        = -zr*(drr3*atomJ.q-drr5*dkr+drr7*qkr) - drr3*atomJ.labFrameDipole_Z + 2.0f*drr5*qkz;
298

Mark Friedrichs's avatar
Mark Friedrichs committed
299
300
301
        float fkd0        = xr*(drr3*atomI.q+drr5*dir+drr7*qir)  - drr3*atomI.labFrameDipole_X - 2.0f*drr5*qix;
        float fkd1        = yr*(drr3*atomI.q+drr5*dir+drr7*qir)  - drr3*atomI.labFrameDipole_Y - 2.0f*drr5*qiy;
        float fkd2        = zr*(drr3*atomI.q+drr5*dir+drr7*qir)  - drr3*atomI.labFrameDipole_Z - 2.0f*drr5*qiz;
302

Mark Friedrichs's avatar
Mark Friedrichs committed
303
304
305
        float fip0        = -xr*(prr3*atomJ.q-prr5*dkr+prr7*qkr) - prr3*atomJ.labFrameDipole_X + 2.0f*prr5*qkx;
        float fip1        = -yr*(prr3*atomJ.q-prr5*dkr+prr7*qkr) - prr3*atomJ.labFrameDipole_Y + 2.0f*prr5*qky;
        float fip2        = -zr*(prr3*atomJ.q-prr5*dkr+prr7*qkr) - prr3*atomJ.labFrameDipole_Z + 2.0f*prr5*qkz;
306

Mark Friedrichs's avatar
Mark Friedrichs committed
307
308
309
        float fkp0        = xr*(prr3*atomI.q+prr5*dir+prr7*qir)  - prr3*atomI.labFrameDipole_X - 2.0f*prr5*qix;
        float fkp1        = yr*(prr3*atomI.q+prr5*dir+prr7*qir)  - prr3*atomI.labFrameDipole_Y - 2.0f*prr5*qiy;
        float fkp2        = zr*(prr3*atomI.q+prr5*dir+prr7*qir)  - prr3*atomI.labFrameDipole_Z - 2.0f*prr5*qiz;
310

Peter Eastman's avatar
Peter Eastman committed
311
        // increment the field at each site due to this interaction
312

313
314
315
        fields[0].x       = fim0 - fid0;
        fields[1].x       = fim1 - fid1;
        fields[2].x       = fim2 - fid2;
Mark Friedrichs's avatar
Mark Friedrichs committed
316

317
318
319
        fields[0].y       = fkm0 - fkd0;
        fields[1].y       = fkm1 - fkd1;
        fields[2].y       = fkm2 - fkd2;
Mark Friedrichs's avatar
Mark Friedrichs committed
320

321
322
323
        fields[0].z       = fim0 - fip0;
        fields[1].z       = fim1 - fip1;
        fields[2].z       = fim2 - fip2;
Mark Friedrichs's avatar
Mark Friedrichs committed
324

325
326
327
        fields[0].w       = fkm0 - fkp0;
        fields[1].w       = fkm1 - fkp1;
        fields[2].w       = fkm2 - fkp2;
328
329
330
 
    } else {

331
332
333
334
        fields[0].x       = 0.0f;
        fields[0].y       = 0.0f;
        fields[0].z       = 0.0f;
        fields[0].w       = 0.0f;
335
    
336
337
338
339
        fields[1].x       = 0.0f;
        fields[1].y       = 0.0f;
        fields[1].z       = 0.0f;
        fields[1].w       = 0.0f;
340
    
341
342
343
344
        fields[2].x       = 0.0f;
        fields[2].y       = 0.0f;
        fields[2].z       = 0.0f;
        fields[2].w       = 0.0f;
345
    }
Mark Friedrichs's avatar
Mark Friedrichs committed
346

347
348
349
350
}

// Include versions of the kernels for N^2 calculations.

Mark Friedrichs's avatar
Mark Friedrichs committed
351
#define METHOD_NAME(a, b) a##Cutoff##b
352
353
354
#include "kCalculateAmoebaCudaPmeFixedEField.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
Mark Friedrichs's avatar
Mark Friedrichs committed
355
#define METHOD_NAME(a, b) a##CutoffByWarp##b
356
357
#include "kCalculateAmoebaCudaPmeFixedEField.h"

Mark Friedrichs's avatar
Mark Friedrichs committed
358
359
360
361
362
363
364
365
366
/**---------------------------------------------------------------------------------------

   Report whether a number is a nan or infinity

   @param number               number to test
   @return 1 if number is  nan or infinity; else return 0

   --------------------------------------------------------------------------------------- */

367
368
369
370
371
372
373
374
/**---------------------------------------------------------------------------------------

   Compute fixed electric field using PME

   @param amoebaGpu        amoebaGpu context

   --------------------------------------------------------------------------------------- */

Mark Friedrichs's avatar
Mark Friedrichs committed
375
static void cudaComputeAmoebaPmeDirectFixedEField( amoebaGpuContext amoebaGpu )
376
377
{
  
Mark Friedrichs's avatar
Mark Friedrichs committed
378
379
    static unsigned int threadsPerBlock  = 0;
    gpuContext gpu                       = amoebaGpu->gpuContext;
380
381
382

    kClearFields_3( amoebaGpu, 2 );

Mark Friedrichs's avatar
Mark Friedrichs committed
383
384
385
386
387
388
389
    // on first pass, set threads/block

    if( threadsPerBlock == 0 ){ 
        unsigned int maxThreads;
        if (gpu->sm_version >= SM_20)
            maxThreads = 384; 
        else if (gpu->sm_version >= SM_12)
390
            maxThreads = 192;
Mark Friedrichs's avatar
Mark Friedrichs committed
391
392
        else
            maxThreads = 64;
Mark Friedrichs's avatar
Mark Friedrichs committed
393
        threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(FixedFieldParticle), gpu->sharedMemoryPerBlock ), maxThreads);
Mark Friedrichs's avatar
Mark Friedrichs committed
394
395
    }    

396
    if (gpu->bOutputBufferPerWarp){
397
        kCalculateAmoebaPmeDirectFixedE_FieldCutoffByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
Mark Friedrichs's avatar
Mark Friedrichs committed
398
                                                                           gpu->sim.pInteractingWorkUnit,
Mark Friedrichs's avatar
Mark Friedrichs committed
399
400
                                                                           amoebaGpu->psWorkArray_3_1->_pDevData,
                                                                           amoebaGpu->psWorkArray_3_2->_pDevData );
401
    } else {
402
        kCalculateAmoebaPmeDirectFixedE_FieldCutoff_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(FixedFieldParticle)*threadsPerBlock>>>(
Mark Friedrichs's avatar
Mark Friedrichs committed
403
                                                                           gpu->sim.pInteractingWorkUnit,
Mark Friedrichs's avatar
Mark Friedrichs committed
404
405
                                                                           amoebaGpu->psWorkArray_3_1->_pDevData,
                                                                           amoebaGpu->psWorkArray_3_2->_pDevData );
406
407
408
409
410
411
    }
    LAUNCHERROR("kCalculateAmoebaPmeDirectFixedE_Field_kernel");

    kReducePmeDirectE_Fields( amoebaGpu );

}
Mark Friedrichs's avatar
Mark Friedrichs committed
412
413
414

void cudaComputeAmoebaPmeFixedEField( amoebaGpuContext amoebaGpu )
{
Mark Friedrichs's avatar
Mark Friedrichs committed
415

416
    kCalculateAmoebaPMEFixedMultipoles( amoebaGpu );
Mark Friedrichs's avatar
Mark Friedrichs committed
417
    cudaComputeAmoebaPmeDirectFixedEField( amoebaGpu );
Mark Friedrichs's avatar
Mark Friedrichs committed
418

Mark Friedrichs's avatar
Mark Friedrichs committed
419
}