kCalculateAmoebaCudaPmeDirectElectrostatic.h 21.4 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
/* -------------------------------------------------------------------------- *
 *                                   OpenMM                                   *
 * -------------------------------------------------------------------------- *
 * This is part of the OpenMM molecular simulation toolkit originating from   *
 * Simbios, the NIH National Center for Physics-Based Simulation of           *
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
 * Portions copyright (c) 2009 Stanford University and the Authors.           *
 * Authors: Scott Le Grand, Peter Eastman                                     *
 * Contributors:                                                              *
 *                                                                            *
 * This program is free software: you can redistribute it and/or modify       *
 * it under the terms of the GNU Lesser General Public License as published   *
 * by the Free Software Foundation, either version 3 of the License, or       *
 * (at your option) any later version.                                        *
 *                                                                            *
 * This program is distributed in the hope that it will be useful,            *
 * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
 * GNU Lesser General Public License for more details.                        *
 *                                                                            *
 * You should have received a copy of the GNU Lesser General Public License   *
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
 * -------------------------------------------------------------------------- */

#include "amoebaScaleFactors.h"

__global__ 
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(384, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(128, 1)
#else
__launch_bounds__(64, 1)
#endif
Mark Friedrichs's avatar
Mark Friedrichs committed
37
38
void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)(
                            unsigned int* workUnit, float* outputForce, float* outputTorque
39
40
41
42
43
44
45

#ifdef AMOEBA_DEBUG
                           , float4* debugArray, unsigned int targetAtom
#endif
){

#ifdef AMOEBA_DEBUG
46
47
    int maxPullIndex = 2;
    float4 pullBack[12];
48
49
#endif

Mark Friedrichs's avatar
Mark Friedrichs committed
50
    extern __shared__ PmeDirectElectrostaticParticle sA[];
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67

    unsigned int totalWarps      = gridDim.x*blockDim.x/GRID;
    unsigned int warp            = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
    unsigned int numWorkUnits    = cSim.pInteractionCount[0];
    unsigned int pos             = warp*numWorkUnits/totalWarps;
    unsigned int end             = (warp+1)*numWorkUnits/totalWarps;
    unsigned int lasty           = 0xFFFFFFFF;
    float totalEnergy            = 0.0f;     

    float scalingFactors[LastScalingIndex];

    while (pos < end)
    {

        unsigned int x;
        unsigned int y;
        bool bExclusionFlag;
Mark Friedrichs's avatar
Mark Friedrichs committed
68
69
70
        int  dScaleMask;
        int2 pScaleMask;
        int2 mScaleMask;
71
72
73
74
75
76
77
78
79

        // Extract cell coordinates

        decodeCell( workUnit[pos], &x, &y, &bExclusionFlag );

        unsigned int tgx              = threadIdx.x & (GRID - 1);
        unsigned int tbx              = threadIdx.x - tgx;
        unsigned int tj               = tgx;

Mark Friedrichs's avatar
Mark Friedrichs committed
80
        PmeDirectElectrostaticParticle* psA    = &sA[tbx];
81
        unsigned int atomI            = x + tgx;
Mark Friedrichs's avatar
Mark Friedrichs committed
82
83
        PmeDirectElectrostaticParticle localParticle;
        loadPmeDirectElectrostaticShared(&localParticle, atomI );
84

Mark Friedrichs's avatar
Mark Friedrichs committed
85
86
87
        localParticle.force[0]        = 0.0f;
        localParticle.force[1]        = 0.0f;
        localParticle.force[2]        = 0.0f;
88

Mark Friedrichs's avatar
Mark Friedrichs committed
89
90
91
        localParticle.torque[0]       = 0.0f;
        localParticle.torque[1]       = 0.0f;
        localParticle.torque[2]       = 0.0f;
92
93
94
95
96
97
98
99
100
101
102

        scalingFactors[PScaleIndex]   = 1.0f;
        scalingFactors[DScaleIndex]   = 1.0f;
        scalingFactors[UScaleIndex]   = 1.0f;
        scalingFactors[MScaleIndex]   = 1.0f;

        if (x == y) // Handle diagonals uniquely at 50% efficiency
        {

            // load shared data

Mark Friedrichs's avatar
Mark Friedrichs committed
103
            loadPmeDirectElectrostaticShared( &(sA[threadIdx.x]), atomI );
104

Mark Friedrichs's avatar
Mark Friedrichs committed
105
            if (bExclusionFlag)
106
107
108
            {
                unsigned int xi       = x >> GRIDBITS;
                unsigned int cell     = xi + xi*cAmoebaSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
Mark Friedrichs's avatar
Mark Friedrichs committed
109
110
111
112
113
114
                dScaleMask            = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
                pScaleMask            = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
                mScaleMask            = cAmoebaSim.pM_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
            } else {
                scalingFactors[DScaleIndex] = scalingFactors[PScaleIndex] = scalingFactors[MScaleIndex] = 1.0f;
            }
115

Mark Friedrichs's avatar
Mark Friedrichs committed
116
117
            for (unsigned int j = 0; j < GRID; j++)
            {
118

Mark Friedrichs's avatar
Mark Friedrichs committed
119
120
                float force[3];
                float torque[2][3];
121

Mark Friedrichs's avatar
Mark Friedrichs committed
122
                unsigned int atomJ = y + j;
123

Mark Friedrichs's avatar
Mark Friedrichs committed
124
                // set scale factors
125

Mark Friedrichs's avatar
Mark Friedrichs committed
126
127
                if (bExclusionFlag)
                {
128
129
130
                    getMaskedDScaleFactor( j, dScaleMask, scalingFactors + DScaleIndex );
                    getMaskedPScaleFactor( j, pScaleMask, scalingFactors + PScaleIndex );
                    getMaskedMScaleFactor( j, mScaleMask, scalingFactors + MScaleIndex );
Mark Friedrichs's avatar
Mark Friedrichs committed
131
                }
132

Mark Friedrichs's avatar
Mark Friedrichs committed
133
                // force
134

Mark Friedrichs's avatar
Mark Friedrichs committed
135
136
137
                float energy;
                calculatePmeDirectElectrostaticPairIxn_kernel( localParticle,   psA[j],
                                                               scalingFactors, force, torque, &energy
138
139
140
141
142
#ifdef AMOEBA_DEBUG
, pullBack
#endif
 );

Mark Friedrichs's avatar
Mark Friedrichs committed
143
144
                // nan*0.0 = nan not 0.0, so explicitly exclude (atomI == atomJ) contribution
                // by setting match flag
145

Mark Friedrichs's avatar
Mark Friedrichs committed
146
                unsigned int mask       =  ( (atomI == atomJ) || (atomI >= cAmoebaSim.numberOfAtoms) || (atomJ >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
147

Mark Friedrichs's avatar
Mark Friedrichs committed
148
                // add to field at atomI the field due atomJ's charge/dipole/quadrupole
149

Mark Friedrichs's avatar
Mark Friedrichs committed
150
151
152
                localParticle.force[0]            += mask ? force[0]     : 0.0f;
                localParticle.force[1]            += mask ? force[1]     : 0.0f;
                localParticle.force[2]            += mask ? force[2]     : 0.0f;
153

Mark Friedrichs's avatar
Mark Friedrichs committed
154
155
156
                localParticle.torque[0]           += mask ? torque[0][0] : 0.0f;
                localParticle.torque[1]           += mask ? torque[0][1] : 0.0f;
                localParticle.torque[2]           += mask ? torque[0][2] : 0.0f;
157

Mark Friedrichs's avatar
Mark Friedrichs committed
158
                totalEnergy                       += mask ? 0.5*energy   : 0.0f;
159
160

#ifdef AMOEBA_DEBUG
161
162
163
164
165
166
/*
energy =  mask ? 0.5*energy   : 0.0f;
if( atomI < 200 && (fabs( energy ) > 1.0e+8 || energy != energy) ){
    debugSetup( atomI, atomJ, debugArray, pullBack );
} */

167
if( atomI == targetAtom ){
Mark Friedrichs's avatar
Mark Friedrichs committed
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
    unsigned int index                 = (atomI == targetAtom) ? atomJ : atomI;
    float blockId                      = 1.0f;

    debugArray[index].x                = (float) atomI;
    debugArray[index].y                = (float) atomJ;
    debugArray[index].z                = (float) y;
    debugArray[index].w                = blockId;

    index                             += cAmoebaSim.paddedNumberOfAtoms;
    debugArray[index].x                = mask ? force[0]     : 0.0f;
    debugArray[index].y                = mask ? force[1]     : 0.0f;
    debugArray[index].z                = mask ? force[2]     : 0.0f;
    debugArray[index].w                = blockId;


    index                             += cAmoebaSim.paddedNumberOfAtoms;
    debugArray[index].x                = mask ? torque[0][0] : 0.0f;
    debugArray[index].y                = mask ? torque[0][1] : 0.0f;
    debugArray[index].z                = mask ? torque[0][2] : 0.0f;
Mark Friedrichs's avatar
Mark Friedrichs committed
187
    debugArray[index].w                = mask ? energy       : 0.0f;
Mark Friedrichs's avatar
Mark Friedrichs committed
188
189
190
191
192

    index                             += cAmoebaSim.paddedNumberOfAtoms;
    debugArray[index].x                = mask ? torque[0][0] : 0.0f;
    debugArray[index].y                = mask ? torque[0][1] : 0.0f;
    debugArray[index].z                = mask ? torque[0][2] : 0.0f;
Mark Friedrichs's avatar
Mark Friedrichs committed
193
    debugArray[index].w                = (float) (blockIdx.x * blockDim.x + threadIdx.x);
Mark Friedrichs's avatar
Mark Friedrichs committed
194

195
    for( int pullIndex = 0; pullIndex < maxPullIndex; pullIndex++ ){
Mark Friedrichs's avatar
Mark Friedrichs committed
196
197
198
199
200
201
        index                             += cAmoebaSim.paddedNumberOfAtoms;
        debugArray[index].x                = pullBack[pullIndex].x;
        debugArray[index].y                = pullBack[pullIndex].y;
        debugArray[index].z                = pullBack[pullIndex].z;
        debugArray[index].w                = pullBack[pullIndex].w;
    }
202
203
204
}
#endif

Mark Friedrichs's avatar
Mark Friedrichs committed
205
            } // end of j-loop
206

Mark Friedrichs's avatar
Mark Friedrichs committed
207
208
209
210
            // include self energy and self torque

            if( atomI < cAmoebaSim.numberOfAtoms ){
                calculatePmeSelfTorqueElectrostaticPairIxn_kernel( localParticle );
Mark Friedrichs's avatar
Mark Friedrichs committed
211
212
213
                float energy;
                calculatePmeSelfEnergyElectrostaticPairIxn_kernel( localParticle, &energy );
                totalEnergy += energy;
Mark Friedrichs's avatar
Mark Friedrichs committed
214
215
            }

216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
            // Write results

#ifdef USE_OUTPUT_BUFFER_PER_WARP
            float  of;
            unsigned int offset                 = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
            of                                  = outputForce[offset];
            of                                 += localParticle.force[0];
            outputForce[offset]                 = of;

            of                                  = outputForce[offset+1];
            of                                 += localParticle.force[1];
            outputForce[offset+1]               = of;

            of                                  = outputForce[offset+2];
            of                                 += localParticle.force[2];
            outputForce[offset+2]               = of;

            of                                  = outputTorque[offset];
            of                                 += localParticle.torque[0];
            outputTorque[offset]                = of;

            of                                  = outputTorque[offset+1];
            of                                 += localParticle.torque[1];
            outputTorque[offset+1]              = of;

            of                                  = outputTorque[offset+2];
            of                                 += localParticle.torque[2];
            outputTorque[offset+2]              = of;

#else
            unsigned int offset                 = 3*(x + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
            outputForce[offset]                 = localParticle.force[0];
            outputForce[offset+1]               = localParticle.force[1];
            outputForce[offset+2]               = localParticle.force[2];

            outputTorque[offset]                = localParticle.torque[0];
            outputTorque[offset+1]              = localParticle.torque[1];
            outputTorque[offset+2]              = localParticle.torque[2];
#endif

Mark Friedrichs's avatar
Mark Friedrichs committed
256
        } else {
257

Mark Friedrichs's avatar
Mark Friedrichs committed
258
259
260
261
262
263
264
265
            if (lasty != y) {

                // load shared data

               loadPmeDirectElectrostaticShared( &(sA[threadIdx.x]), (y+tgx) );

            }

Mark Friedrichs's avatar
Mark Friedrichs committed
266
267
268
269
            unsigned int flags           = cSim.pInteractionFlag[pos];
            if (flags == 0) {
                // No interactions in this block.
            } else {
270

Mark Friedrichs's avatar
Mark Friedrichs committed
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
                if (lasty != y) {
    
                    // load shared data
    
                   loadPmeDirectElectrostaticShared( &(sA[threadIdx.x]), (y+tgx) );
    
                }
    
                sA[threadIdx.x].force[0]     = 0.0f;
                sA[threadIdx.x].force[1]     = 0.0f;
                sA[threadIdx.x].force[2]     = 0.0f;
    
                sA[threadIdx.x].torque[0]    = 0.0f;
                sA[threadIdx.x].torque[1]    = 0.0f;
                sA[threadIdx.x].torque[2]    = 0.0f;
    
                if( bExclusionFlag )
                {
                    unsigned int xi   = x >> GRIDBITS;
                    unsigned int yi   = y >> GRIDBITS;
                    unsigned int cell = xi+yi*cAmoebaSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
                    dScaleMask        = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
                    pScaleMask        = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
                    mScaleMask        = cAmoebaSim.pM_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
                } else {
                    scalingFactors[DScaleIndex] = scalingFactors[PScaleIndex] = scalingFactors[MScaleIndex] = 1.0f;
                }
       
299
300
                for (unsigned int j = 0; j < GRID; j++)
                {
Mark Friedrichs's avatar
Mark Friedrichs committed
301
302
303
304
           
                    unsigned int jIdx  = (flags == 0xFFFFFFFF) ? tj : j;
                    unsigned int atomJ = y + jIdx;
           
305
306
307
                    float force[3];
                    float torque[2][3];
           
Mark Friedrichs's avatar
Mark Friedrichs committed
308
309
310
311
312
313
314
315
316
317
318
                    // set scale factors
           
                    if( bExclusionFlag )
                    {
                        getMaskedDScaleFactor( jIdx, dScaleMask, scalingFactors + DScaleIndex );
                        getMaskedPScaleFactor( jIdx, pScaleMask, scalingFactors + PScaleIndex );
                        getMaskedMScaleFactor( jIdx, mScaleMask, scalingFactors + MScaleIndex );
                    }
           
                    // force
           
319
                    float energy;
Mark Friedrichs's avatar
Mark Friedrichs committed
320
                    calculatePmeDirectElectrostaticPairIxn_kernel( localParticle,   psA[jIdx],
Mark Friedrichs's avatar
Mark Friedrichs committed
321
                                                                   scalingFactors, force, torque, &energy
322
#ifdef AMOEBA_DEBUG
Mark Friedrichs's avatar
Mark Friedrichs committed
323
    , pullBack
324
#endif
Mark Friedrichs's avatar
Mark Friedrichs committed
325
326
     );
    
327
                    // check if atoms out-of-bounds
Mark Friedrichs's avatar
Mark Friedrichs committed
328
    
329
                    unsigned int mask    =  ( (atomI >= cAmoebaSim.numberOfAtoms) || (atomJ >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
Mark Friedrichs's avatar
Mark Friedrichs committed
330
    
331
                    // add force and torque to atom I due atom J
Mark Friedrichs's avatar
Mark Friedrichs committed
332
    
333
334
335
                    localParticle.force[0]         += mask ? force[0]      : 0.0f;
                    localParticle.force[1]         += mask ? force[1]      : 0.0f;
                    localParticle.force[2]         += mask ? force[2]      : 0.0f;
Mark Friedrichs's avatar
Mark Friedrichs committed
336
    
337
338
339
340
                    localParticle.torque[0]        += mask ? torque[0][0]  : 0.0f;
                    localParticle.torque[1]        += mask ? torque[0][1]  : 0.0f;
                    localParticle.torque[2]        += mask ? torque[0][2]  : 0.0f;
    
Mark Friedrichs's avatar
Mark Friedrichs committed
341
                    totalEnergy                    += mask ? energy        : 0.0f;
342
343
344

                    // add force and torque to atom J due atom I

Mark Friedrichs's avatar
Mark Friedrichs committed
345
                    if( flags == 0xFFFFFFFF ){
346

Mark Friedrichs's avatar
Mark Friedrichs committed
347
348
349
350
351
352
353
354
355
356
                        psA[jIdx].force[0]               -= mask ?  force[0]     : 0.0f;
                        psA[jIdx].force[1]               -= mask ?  force[1]     : 0.0f;
                        psA[jIdx].force[2]               -= mask ?  force[2]     : 0.0f;
    
                        psA[jIdx].torque[0]              += mask ?  torque[1][0] : 0.0f;
                        psA[jIdx].torque[1]              += mask ?  torque[1][1] : 0.0f;
                        psA[jIdx].torque[2]              += mask ?  torque[1][2] : 0.0f;

                    } else {

Mark Friedrichs's avatar
Mark Friedrichs committed
357
358
359
                        sA[threadIdx.x].tempForce[0]     = mask ? 0.0f : force[0];
                        sA[threadIdx.x].tempForce[1]     = mask ? 0.0f : force[1];
                        sA[threadIdx.x].tempForce[2]     = mask ? 0.0f : force[2];
Mark Friedrichs's avatar
Mark Friedrichs committed
360
   
Mark Friedrichs's avatar
Mark Friedrichs committed
361
362
363
                        sA[threadIdx.x].tempTorque[0]    = mask ? 0.0f : torque[1][0];
                        sA[threadIdx.x].tempTorque[1]    = mask ? 0.0f : torque[1][1];
                        sA[threadIdx.x].tempTorque[2]    = mask ? 0.0f : torque[1][2];
Mark Friedrichs's avatar
Mark Friedrichs committed
364
365

                        if( tgx % 2 == 0 ){
Mark Friedrichs's avatar
Mark Friedrichs committed
366
                            sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+1] );  
Mark Friedrichs's avatar
Mark Friedrichs committed
367
368
                        }
                        if( tgx % 4 == 0 ){
Mark Friedrichs's avatar
Mark Friedrichs committed
369
                            sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+2] );  
Mark Friedrichs's avatar
Mark Friedrichs committed
370
371
                        }
                        if( tgx % 8 == 0 ){
Mark Friedrichs's avatar
Mark Friedrichs committed
372
                            sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+4] );  
Mark Friedrichs's avatar
Mark Friedrichs committed
373
374
                        }
                        if( tgx % 16 == 0 ){
Mark Friedrichs's avatar
Mark Friedrichs committed
375
                            sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+8] );  
Mark Friedrichs's avatar
Mark Friedrichs committed
376
377
378
379
                        }

                        if (tgx == 0)
                        {
Mark Friedrichs's avatar
Mark Friedrichs committed
380
381
382
                            psA[jIdx].force[0]  -= sA[threadIdx.x].tempForce[0]  + sA[threadIdx.x+16].tempForce[0];
                            psA[jIdx].force[1]  -= sA[threadIdx.x].tempForce[1]  + sA[threadIdx.x+16].tempForce[1];
                            psA[jIdx].force[2]  -= sA[threadIdx.x].tempForce[2]  + sA[threadIdx.x+16].tempForce[2];
Mark Friedrichs's avatar
Mark Friedrichs committed
383

Mark Friedrichs's avatar
Mark Friedrichs committed
384
385
386
                            psA[jIdx].torque[0] += sA[threadIdx.x].tempTorque[0] + sA[threadIdx.x+16].tempTorque[0];
                            psA[jIdx].torque[1] += sA[threadIdx.x].tempTorque[1] + sA[threadIdx.x+16].tempTorque[1];
                            psA[jIdx].torque[2] += sA[threadIdx.x].tempTorque[2] + sA[threadIdx.x+16].tempTorque[2];
Mark Friedrichs's avatar
Mark Friedrichs committed
387
388
389
                        }
                    }
 
Mark Friedrichs's avatar
Mark Friedrichs committed
390
                    tj = (tj + 1) & (GRID - 1);
391

Mark Friedrichs's avatar
Mark Friedrichs committed
392
                } // end of j-loop
393

Mark Friedrichs's avatar
Mark Friedrichs committed
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
                // Write results
    
    #ifdef USE_OUTPUT_BUFFER_PER_WARP
    
                float of;
                unsigned int offset                 = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
                of                                  = outputForce[offset];
                of                                 += localParticle.force[0];
                outputForce[offset]                 = of;
    
                of                                  = outputForce[offset+1];
                of                                 += localParticle.force[1];
                outputForce[offset+1]               = of;
    
                of                                  = outputForce[offset+2];
                of                                 += localParticle.force[2];
                outputForce[offset+2]               = of;
    
                of                                  = outputTorque[offset];
                of                                 += localParticle.torque[0];
                outputTorque[offset]                 = of;
    
                of                                  = outputTorque[offset+1];
                of                                 += localParticle.torque[1];
                outputTorque[offset+1]               = of;
    
                of                                  = outputTorque[offset+2];
                of                                 += localParticle.torque[2];
                outputTorque[offset+2]               = of;
    
                offset                              = 3*(y + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
    
                of                                  = outputForce[offset];
                of                                 += sA[threadIdx.x].force[0];
                outputForce[offset]                 = of;
    
                of                                  = outputForce[offset+1];
                of                                 += sA[threadIdx.x].force[1];
                outputForce[offset+1]               = of;
    
                of                                  = outputForce[offset+2];
                of                                 += sA[threadIdx.x].force[2];
                outputForce[offset+2]               = of;
    
                of                                  = outputTorque[offset];
                of                                 += sA[threadIdx.x].torque[0];
                outputTorque[offset]                = of;
    
                of                                  = outputTorque[offset+1];
                of                                 += sA[threadIdx.x].torque[1];
                outputTorque[offset+1]              = of;
    
                of                                  = outputTorque[offset+2];
                of                                 += sA[threadIdx.x].torque[2];
                outputTorque[offset+2]              = of;
    
    #else
                unsigned int offset                 = 3*(x + tgx + (y >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
    
                outputForce[offset]                 = localParticle.force[0];
                outputForce[offset+1]               = localParticle.force[1];
                outputForce[offset+2]               = localParticle.force[2];
    
                outputTorque[offset]                = localParticle.torque[0];
                outputTorque[offset+1]              = localParticle.torque[1];
                outputTorque[offset+2]              = localParticle.torque[2];
    
                offset                              = 3*(y + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
    
                outputForce[offset]                 = sA[threadIdx.x].force[0];
                outputForce[offset+1]               = sA[threadIdx.x].force[1];
                outputForce[offset+2]               = sA[threadIdx.x].force[2];
    
                outputTorque[offset]                = sA[threadIdx.x].torque[0];
                outputTorque[offset+1]              = sA[threadIdx.x].torque[1];
                outputTorque[offset+2]              = sA[threadIdx.x].torque[2];
    
    #endif
                lasty = y;
473

Mark Friedrichs's avatar
Mark Friedrichs committed
474
            } // end of pInteractionFlag block
475
476
477
478
479
        }
        pos++;
    }
    cSim.pEnergy[blockIdx.x * blockDim.x + threadIdx.x] += totalEnergy;
}