kCalculateAmoebaCudaPmeDirectElectrostatic.h 19.3 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
/* -------------------------------------------------------------------------- *
 *                                   OpenMM                                   *
 * -------------------------------------------------------------------------- *
 * This is part of the OpenMM molecular simulation toolkit originating from   *
 * Simbios, the NIH National Center for Physics-Based Simulation of           *
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
 * Portions copyright (c) 2009 Stanford University and the Authors.           *
 * Authors: Scott Le Grand, Peter Eastman                                     *
 * Contributors:                                                              *
 *                                                                            *
 * This program is free software: you can redistribute it and/or modify       *
 * it under the terms of the GNU Lesser General Public License as published   *
 * by the Free Software Foundation, either version 3 of the License, or       *
 * (at your option) any later version.                                        *
 *                                                                            *
 * This program is distributed in the hope that it will be useful,            *
 * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
 * GNU Lesser General Public License for more details.                        *
 *                                                                            *
 * You should have received a copy of the GNU Lesser General Public License   *
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
 * -------------------------------------------------------------------------- */

#include "amoebaScaleFactors.h"

__global__ 
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(384, 1)
32
#elif (__CUDA_ARCH__ >= 120)
33
34
35
36
__launch_bounds__(128, 1)
#else
__launch_bounds__(64, 1)
#endif
Mark Friedrichs's avatar
Mark Friedrichs committed
37
void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)(
38
                            unsigned int* workUnit, float* outputTorque
39
40
41
42
43
44

#ifdef AMOEBA_DEBUG
                           , float4* debugArray, unsigned int targetAtom
#endif
){
#ifdef AMOEBA_DEBUG
Mark Friedrichs's avatar
Mark Friedrichs committed
45
    int maxPullIndex = 7;
46
    float4 pullBack[12];
47
48
#endif

Mark Friedrichs's avatar
Mark Friedrichs committed
49
    extern __shared__ PmeDirectElectrostaticParticle sA[];
50
51
52
53
54
55
56
57

    unsigned int totalWarps      = gridDim.x*blockDim.x/GRID;
    unsigned int warp            = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
    unsigned int numWorkUnits    = cSim.pInteractionCount[0];
    unsigned int pos             = warp*numWorkUnits/totalWarps;
    unsigned int end             = (warp+1)*numWorkUnits/totalWarps;
    unsigned int lasty           = 0xFFFFFFFF;
    float totalEnergy            = 0.0f;     
Mark Friedrichs's avatar
Mark Friedrichs committed
58
    float4 forceTorqueEnergy[3];
59
60
61
62
63
64
65
66
67

    float scalingFactors[LastScalingIndex];

    while (pos < end)
    {

        unsigned int x;
        unsigned int y;
        bool bExclusionFlag;
Mark Friedrichs's avatar
Mark Friedrichs committed
68
69
70
        int  dScaleMask;
        int2 pScaleMask;
        int2 mScaleMask;
71
72
73
74
75

        // Extract cell coordinates

        decodeCell( workUnit[pos], &x, &y, &bExclusionFlag );

Mark Friedrichs's avatar
Mark Friedrichs committed
76
77
78
        unsigned int tgx                       = threadIdx.x & (GRID - 1);
        unsigned int tbx                       = threadIdx.x - tgx;
        unsigned int tj                        = tgx;
79

Mark Friedrichs's avatar
Mark Friedrichs committed
80
        PmeDirectElectrostaticParticle* psA    = &sA[tbx];
Mark Friedrichs's avatar
Mark Friedrichs committed
81
        unsigned int atomI                     = x + tgx;
Mark Friedrichs's avatar
Mark Friedrichs committed
82
83
        PmeDirectElectrostaticParticle localParticle;
        loadPmeDirectElectrostaticShared(&localParticle, atomI );
84

Mark Friedrichs's avatar
Mark Friedrichs committed
85
86
87
        localParticle.force[0]                 = 0.0f;
        localParticle.force[1]                 = 0.0f;
        localParticle.force[2]                 = 0.0f;
88

Mark Friedrichs's avatar
Mark Friedrichs committed
89
90
91
        localParticle.torque[0]                = 0.0f;
        localParticle.torque[1]                = 0.0f;
        localParticle.torque[2]                = 0.0f;
92

Mark Friedrichs's avatar
Mark Friedrichs committed
93
        scalingFactors[UScaleIndex]            = 1.0f;
94
95
96
97
98
99

        if (x == y) // Handle diagonals uniquely at 50% efficiency
        {

            // load shared data

Mark Friedrichs's avatar
Mark Friedrichs committed
100
            loadPmeDirectElectrostaticShared( &(sA[threadIdx.x]), atomI );
101

Mark Friedrichs's avatar
Mark Friedrichs committed
102
            if (bExclusionFlag)
103
104
            {
                unsigned int xi       = x >> GRIDBITS;
105
                unsigned int cell     = xi + xi*cSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
Mark Friedrichs's avatar
Mark Friedrichs committed
106
107
108
109
110
111
                dScaleMask            = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
                pScaleMask            = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
                mScaleMask            = cAmoebaSim.pM_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
            } else {
                scalingFactors[DScaleIndex] = scalingFactors[PScaleIndex] = scalingFactors[MScaleIndex] = 1.0f;
            }
112

Mark Friedrichs's avatar
Mark Friedrichs committed
113
114
            for (unsigned int j = 0; j < GRID; j++)
            {
115

Mark Friedrichs's avatar
Mark Friedrichs committed
116
                unsigned int atomJ = y + j;
117

Mark Friedrichs's avatar
Mark Friedrichs committed
118
                // set scale factors
119

Mark Friedrichs's avatar
Mark Friedrichs committed
120
121
                if (bExclusionFlag)
                {
122
123
124
                    getMaskedDScaleFactor( j, dScaleMask, scalingFactors + DScaleIndex );
                    getMaskedPScaleFactor( j, pScaleMask, scalingFactors + PScaleIndex );
                    getMaskedMScaleFactor( j, mScaleMask, scalingFactors + MScaleIndex );
Mark Friedrichs's avatar
Mark Friedrichs committed
125
                }
126

Mark Friedrichs's avatar
Mark Friedrichs committed
127
                // force
128

Mark Friedrichs's avatar
Mark Friedrichs committed
129
                calculatePmeDirectElectrostaticPairIxn_kernel( localParticle, psA[j], scalingFactors, forceTorqueEnergy
130
131
132
133
134
#ifdef AMOEBA_DEBUG
, pullBack
#endif
 );

Mark Friedrichs's avatar
Mark Friedrichs committed
135
136
                // nan*0.0 = nan not 0.0, so explicitly exclude (atomI == atomJ) contribution
                // by setting match flag
137

138
                if( (atomI != atomJ) && (atomI < cSim.atoms) && (atomJ < cSim.atoms) )
Mark Friedrichs's avatar
Mark Friedrichs committed
139
140
141
142
143
144
145
146
147
148
                {
                    localParticle.force[0]      += forceTorqueEnergy[0].x;
                    localParticle.force[1]      += forceTorqueEnergy[0].y;
                    localParticle.force[2]      += forceTorqueEnergy[0].z;
    
                    localParticle.torque[0]     += forceTorqueEnergy[1].x;
                    localParticle.torque[1]     += forceTorqueEnergy[1].y;
                    localParticle.torque[2]     += forceTorqueEnergy[1].z;
    
                    // energy for each diagonal-block ixn included twice, hence factor of 0.5
149

Mark Friedrichs's avatar
Mark Friedrichs committed
150
151
                    totalEnergy                 += 0.5*forceTorqueEnergy[0].w;
                }
152
153

#ifdef AMOEBA_DEBUG
Mark Friedrichs's avatar
Mark Friedrichs committed
154
if( atomI == targetAtom || atomJ == targetAtom ){
155

156
    unsigned int mask       =  ( (atomI == atomJ) || (atomI >= cSim.atoms) || (atomJ >= cSim.atoms) ) ? 0 : 1;
Mark Friedrichs's avatar
Mark Friedrichs committed
157
158
159
160
161
162
163
164
    unsigned int index                 = (atomI == targetAtom) ? atomJ : atomI;
    float blockId                      = 1.0f;

    debugArray[index].x                = (float) atomI;
    debugArray[index].y                = (float) atomJ;
    debugArray[index].z                = (float) y;
    debugArray[index].w                = blockId;

165
    index                             += cSim.paddedNumberOfAtoms;
Mark Friedrichs's avatar
Mark Friedrichs committed
166
167
168
169
    debugArray[index].x                = mask ? forceTorqueEnergy[0].x  : 0.0f;
    debugArray[index].y                = mask ? forceTorqueEnergy[0].y  : 0.0f;
    debugArray[index].z                = mask ? forceTorqueEnergy[0].z  : 0.0f;
    debugArray[index].w                = mask ? forceTorqueEnergy[0].w  : 0.0f;
Mark Friedrichs's avatar
Mark Friedrichs committed
170
171


172
    index                             += cSim.paddedNumberOfAtoms;
Mark Friedrichs's avatar
Mark Friedrichs committed
173
174
175
    debugArray[index].x                = mask ? forceTorqueEnergy[1].x : 0.0f;
    debugArray[index].y                = mask ? forceTorqueEnergy[1].y : 0.0f;
    debugArray[index].z                = mask ? forceTorqueEnergy[1].z : 0.0f;
176
    float offsetF                      = (float)(3*(x + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms));
Mark Friedrichs's avatar
Mark Friedrichs committed
177
    debugArray[index].w                = offsetF;
Mark Friedrichs's avatar
Mark Friedrichs committed
178

179
    index                             += cSim.paddedNumberOfAtoms;
Mark Friedrichs's avatar
Mark Friedrichs committed
180
181
182
183
    debugArray[index].x                = mask ? forceTorqueEnergy[2].x : 0.0f;
    debugArray[index].y                = mask ? forceTorqueEnergy[2].y : 0.0f;
    debugArray[index].z                = mask ? forceTorqueEnergy[2].z : 0.0f;
    debugArray[index].w                = offsetF;
Mark Friedrichs's avatar
Mark Friedrichs committed
184

185
    for( int pullIndex = 0; pullIndex < maxPullIndex; pullIndex++ ){
186
        index                             += cSim.paddedNumberOfAtoms;
Mark Friedrichs's avatar
Mark Friedrichs committed
187
188
189
190
191
        debugArray[index].x                = pullBack[pullIndex].x;
        debugArray[index].y                = pullBack[pullIndex].y;
        debugArray[index].z                = pullBack[pullIndex].z;
        debugArray[index].w                = pullBack[pullIndex].w;
    }
192
193
194
}
#endif

Mark Friedrichs's avatar
Mark Friedrichs committed
195
            } // end of j-loop
196

Mark Friedrichs's avatar
Mark Friedrichs committed
197
198
            // include self energy and self torque

199
            if( atomI < cSim.atoms ){
Mark Friedrichs's avatar
Mark Friedrichs committed
200
                calculatePmeSelfTorqueElectrostaticPairIxn_kernel( localParticle );
Mark Friedrichs's avatar
Mark Friedrichs committed
201
202
203
                float energy;
                calculatePmeSelfEnergyElectrostaticPairIxn_kernel( localParticle, &energy );
                totalEnergy += energy;
Mark Friedrichs's avatar
Mark Friedrichs committed
204
205
            }

206
207
208
            // Write results

#ifdef USE_OUTPUT_BUFFER_PER_WARP
209
210
211
            unsigned int offset                 = (x + tgx + warp*cSim.paddedNumberOfAtoms);
            add3dArrayToFloat4( offset, localParticle.force, cSim.pForce4 );
            add3dArray( 3*offset, localParticle.torque, outputTorque );
212
#else
213
214
215
            unsigned int offset                 = (x + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms);
            add3dArrayToFloat4( offset, localParticle.force, cSim.pForce4 );
            load3dArray( 3*offset, localParticle.torque, outputTorque );
216
217
#endif

Mark Friedrichs's avatar
Mark Friedrichs committed
218
        } else {
219

Mark Friedrichs's avatar
Mark Friedrichs committed
220
221
222
223
224
225
226
227
            if (lasty != y) {

                // load shared data

               loadPmeDirectElectrostaticShared( &(sA[threadIdx.x]), (y+tgx) );

            }

Mark Friedrichs's avatar
Mark Friedrichs committed
228
229
230
231
            unsigned int flags           = cSim.pInteractionFlag[pos];
            if (flags == 0) {
                // No interactions in this block.
            } else {
232

Mark Friedrichs's avatar
Mark Friedrichs committed
233
234
235
236
237
238
239
240
241
242
243
244
                sA[threadIdx.x].force[0]     = 0.0f;
                sA[threadIdx.x].force[1]     = 0.0f;
                sA[threadIdx.x].force[2]     = 0.0f;
    
                sA[threadIdx.x].torque[0]    = 0.0f;
                sA[threadIdx.x].torque[1]    = 0.0f;
                sA[threadIdx.x].torque[2]    = 0.0f;
    
                if( bExclusionFlag )
                {
                    unsigned int xi   = x >> GRIDBITS;
                    unsigned int yi   = y >> GRIDBITS;
245
                    unsigned int cell = xi+yi*cSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
Mark Friedrichs's avatar
Mark Friedrichs committed
246
247
248
249
250
251
252
                    dScaleMask        = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
                    pScaleMask        = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
                    mScaleMask        = cAmoebaSim.pM_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
                } else {
                    scalingFactors[DScaleIndex] = scalingFactors[PScaleIndex] = scalingFactors[MScaleIndex] = 1.0f;
                }
       
253
254
                for (unsigned int j = 0; j < GRID; j++)
                {
Mark Friedrichs's avatar
Mark Friedrichs committed
255
                    if( (flags & (1<<j) ) != 0)
Mark Friedrichs's avatar
Mark Friedrichs committed
256
                    {
Peter Eastman's avatar
Peter Eastman committed
257
258
259
260
261
262
263
264
265
266
267
268
269
270
                        unsigned int jIdx  = (flags == 0xFFFFFFFF) ? tj : j;
                        unsigned int atomJ = y + jIdx;

                        // set scale factors

                        if( bExclusionFlag )
                        {
                            getMaskedDScaleFactor( jIdx, dScaleMask, scalingFactors + DScaleIndex );
                            getMaskedPScaleFactor( jIdx, pScaleMask, scalingFactors + PScaleIndex );
                            getMaskedMScaleFactor( jIdx, mScaleMask, scalingFactors + MScaleIndex );
                        }

                        // force

Mark Friedrichs's avatar
Mark Friedrichs committed
271
272
                        calculatePmeDirectElectrostaticPairIxn_kernel( localParticle, psA[jIdx],
                                                                       scalingFactors, forceTorqueEnergy
273
#ifdef AMOEBA_DEBUG
Mark Friedrichs's avatar
Mark Friedrichs committed
274
    , pullBack
275
#endif
Peter Eastman's avatar
Peter Eastman committed
276
         );
277

Peter Eastman's avatar
Peter Eastman committed
278
                        // check if atoms out-of-bounds
279

280
                        if( (atomI < cSim.atoms) && (atomJ < cSim.atoms) )
Mark Friedrichs's avatar
Mark Friedrichs committed
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
                        {
                            // add force and torque to atom I due atom J
    
                            localParticle.force[0]         += forceTorqueEnergy[0].x;
                            localParticle.force[1]         += forceTorqueEnergy[0].y;
                            localParticle.force[2]         += forceTorqueEnergy[0].z;
    
                            totalEnergy                    += forceTorqueEnergy[0].w;
    
                            localParticle.torque[0]        += forceTorqueEnergy[1].x;
                            localParticle.torque[1]        += forceTorqueEnergy[1].y;
                            localParticle.torque[2]        += forceTorqueEnergy[1].z;
    
                            // add force and torque to atom J due atom I
    
                            if( flags == 0xFFFFFFFF ){
    
                                psA[jIdx].force[0]         -= forceTorqueEnergy[0].x;
                                psA[jIdx].force[1]         -= forceTorqueEnergy[0].y;
                                psA[jIdx].force[2]         -= forceTorqueEnergy[0].z;
    
                                psA[jIdx].torque[0]        += forceTorqueEnergy[2].x;
                                psA[jIdx].torque[1]        += forceTorqueEnergy[2].y;
                                psA[jIdx].torque[2]        += forceTorqueEnergy[2].z;
    
                            } else {
    
                                sA[threadIdx.x].tempForce[0]  = forceTorqueEnergy[0].x;
Mark Friedrichs's avatar
Mark Friedrichs committed
309
310
                                sA[threadIdx.x].tempForce[1]  = forceTorqueEnergy[0].y;
                                sA[threadIdx.x].tempForce[2]  = forceTorqueEnergy[0].z;
Mark Friedrichs's avatar
Mark Friedrichs committed
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
    
                                sA[threadIdx.x].tempTorque[0] = forceTorqueEnergy[2].x;
                                sA[threadIdx.x].tempTorque[1] = forceTorqueEnergy[2].y;
                                sA[threadIdx.x].tempTorque[2] = forceTorqueEnergy[2].z;
    
                                if( tgx % 2 == 0 ){
                                    sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+1] );
                                }
                                if( tgx % 4 == 0 ){
                                    sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+2] );
                                }
                                if( tgx % 8 == 0 ){
                                    sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+4] );
                                }
                                if( tgx % 16 == 0 ){
                                    sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+8] );
                                }
    
                                if (tgx == 0)
                                {
                                    psA[jIdx].force[0]  -= sA[threadIdx.x].tempForce[0]  + sA[threadIdx.x+16].tempForce[0];
                                    psA[jIdx].force[1]  -= sA[threadIdx.x].tempForce[1]  + sA[threadIdx.x+16].tempForce[1];
                                    psA[jIdx].force[2]  -= sA[threadIdx.x].tempForce[2]  + sA[threadIdx.x+16].tempForce[2];
    
                                    psA[jIdx].torque[0] += sA[threadIdx.x].tempTorque[0] + sA[threadIdx.x+16].tempTorque[0];
                                    psA[jIdx].torque[1] += sA[threadIdx.x].tempTorque[1] + sA[threadIdx.x+16].tempTorque[1];
                                    psA[jIdx].torque[2] += sA[threadIdx.x].tempTorque[2] + sA[threadIdx.x+16].tempTorque[2];
                                }
                            }
                        } // end of atoms out-of-bounds
                    } // end of flags&(1<<j block
 
#ifdef AMOEBA_DEBUG
unsigned int jIdx  = (flags == 0xFFFFFFFF) ? tj : j;
unsigned int atomJ = y + jIdx;
346
unsigned int mask  =  ( (atomI >= cSim.atoms) || (atomJ >= cSim.atoms) ) ? 0 : 1;
Mark Friedrichs's avatar
Mark Friedrichs committed
347
348
if( atomI == targetAtom || atomJ == targetAtom ){
    unsigned int index                 = (atomI == targetAtom) ? atomJ : atomI;
Peter Eastman's avatar
Peter Eastman committed
349

Mark Friedrichs's avatar
Mark Friedrichs committed
350
351
352
353
    debugArray[index].x                = (float) atomI;
    debugArray[index].y                = (float) atomJ;
    debugArray[index].z                = (float) y;
    debugArray[index].w                = (flags == 0xFFFFFFFF) ? (float) -141.0f : -151.0f;
Peter Eastman's avatar
Peter Eastman committed
354

355
    index                             += cSim.paddedNumberOfAtoms;
Mark Friedrichs's avatar
Mark Friedrichs committed
356
357
358
359
    debugArray[index].x                = mask ? forceTorqueEnergy[0].x  : 0.0f;
    debugArray[index].y                = mask ? forceTorqueEnergy[0].y  : 0.0f;
    debugArray[index].z                = mask ? forceTorqueEnergy[0].z  : 0.0f;
    debugArray[index].w                = mask ? forceTorqueEnergy[0].w  : 0.0f;
Peter Eastman's avatar
Peter Eastman committed
360
361


362
    index                             += cSim.paddedNumberOfAtoms;
Mark Friedrichs's avatar
Mark Friedrichs committed
363
364
365
    debugArray[index].x                = mask ? forceTorqueEnergy[1].x : 0.0f;
    debugArray[index].y                = mask ? forceTorqueEnergy[1].y : 0.0f;
    debugArray[index].z                = mask ? forceTorqueEnergy[1].z : 0.0f;
366
    float offsetF                      = (float)(3*(y + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms));
Mark Friedrichs's avatar
Mark Friedrichs committed
367
    debugArray[index].w                = offsetF;
Peter Eastman's avatar
Peter Eastman committed
368

369
    index                             += cSim.paddedNumberOfAtoms;
Mark Friedrichs's avatar
Mark Friedrichs committed
370
371
372
    debugArray[index].x                = mask ? forceTorqueEnergy[2].x : 0.0f;
    debugArray[index].y                = mask ? forceTorqueEnergy[2].y : 0.0f;
    debugArray[index].z                = mask ? forceTorqueEnergy[2].z : 0.0f;
373
    offsetF                            = (float) (3*(x + tgx + (y >> GRIDBITS) * cSim.paddedNumberOfAtoms));
Mark Friedrichs's avatar
Mark Friedrichs committed
374
    debugArray[index].w                = offsetF;
Mark Friedrichs's avatar
Mark Friedrichs committed
375

Mark Friedrichs's avatar
Mark Friedrichs committed
376
    for( int pullIndex = 0; pullIndex < maxPullIndex; pullIndex++ ){
377
        index                             += cSim.paddedNumberOfAtoms;
Mark Friedrichs's avatar
Mark Friedrichs committed
378
379
380
381
382
383
384
        debugArray[index].x                = pullBack[pullIndex].x;
        debugArray[index].y                = pullBack[pullIndex].y;
        debugArray[index].z                = pullBack[pullIndex].z;
        debugArray[index].w                = pullBack[pullIndex].w;
    }
}
#endif
Mark Friedrichs's avatar
Mark Friedrichs committed
385
                    tj = (tj + 1) & (GRID - 1);
386

Mark Friedrichs's avatar
Mark Friedrichs committed
387
                } // end of j-loop
388

Mark Friedrichs's avatar
Mark Friedrichs committed
389
390
                // Write results
    
Mark Friedrichs's avatar
Mark Friedrichs committed
391
#ifdef USE_OUTPUT_BUFFER_PER_WARP
Mark Friedrichs's avatar
Mark Friedrichs committed
392
    
393
394
395
                unsigned int offset                 = (x + tgx + warp*cSim.paddedNumberOfAtoms);
                add3dArrayToFloat4( offset, localParticle.force,  cSim.pForce4 );
                add3dArray(       3*offset, localParticle.torque, outputTorque );
Mark Friedrichs's avatar
Mark Friedrichs committed
396
    
397
398
399
                offset                              = (y + tgx + warp*cSim.paddedNumberOfAtoms);
                add3dArrayToFloat4( offset, sA[threadIdx.x].force,  cSim.pForce4 );
                add3dArray(       3*offset, sA[threadIdx.x].torque, outputTorque );
Mark Friedrichs's avatar
Mark Friedrichs committed
400
    
Mark Friedrichs's avatar
Mark Friedrichs committed
401
#else
402
403
404
                unsigned int offset                 = (x + tgx + (y >> GRIDBITS) * cSim.paddedNumberOfAtoms);
                add3dArrayToFloat4( offset, localParticle.force,  cSim.pForce4 );
                load3dArray(       3*offset, localParticle.torque, outputTorque );
Mark Friedrichs's avatar
Mark Friedrichs committed
405
    
406
                offset                              = (y + tgx + (x >> GRIDBITS) * cSim.paddedNumberOfAtoms);
Mark Friedrichs's avatar
Mark Friedrichs committed
407
    
408
409
                add3dArrayToFloat4( offset, sA[threadIdx.x].force,  cSim.pForce4 );
                load3dArray(       3*offset, sA[threadIdx.x].torque, outputTorque );
Mark Friedrichs's avatar
Mark Friedrichs committed
410
    
Mark Friedrichs's avatar
Mark Friedrichs committed
411
#endif
Mark Friedrichs's avatar
Mark Friedrichs committed
412
                lasty = y;
413

Mark Friedrichs's avatar
Mark Friedrichs committed
414
            } // end of pInteractionFlag block
415
416
417
418
419
        }
        pos++;
    }
    cSim.pEnergy[blockIdx.x * blockDim.x + threadIdx.x] += totalEnergy;
}