customNonbondedGroups.cl 5.21 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
#ifdef SUPPORTS_64_BIT_ATOMICS
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#endif

#define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)

typedef struct {
    real x, y, z;
    real q;
    real fx, fy, fz;
    ATOM_PARAMETER_DATA
#ifndef PARAMETER_SIZE_IS_EVEN
    real padding;
#endif
} AtomData;

17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
/**
 * This function is used on devices that don't support 64 bit atomics.  Multiple threads within
 * a single tile might have computed forces on the same atom.  This loops over them and makes sure
 * that only one thread updates the force on any given atom.
 */
void writeForces(__global real4* forceBuffers,__local AtomData* localData, int atomIndex) {
    localData[get_local_id(0)].x = atomIndex;
    SYNC_WARPS;
    real4 forceSum = (real4) 0;
    int start = (get_local_id(0)/TILE_SIZE)*TILE_SIZE;
    int end = start+32;
    bool isFirst = true;
    for (int i = start; i < end; i++)
        if (localData[i].x == atomIndex) {
            forceSum += (real4) (localData[i].fx, localData[i].fy, localData[i].fz, 0);
            isFirst &= (i >= get_local_id(0));
        }
    const unsigned int warp = get_global_id(0)/TILE_SIZE;
    unsigned int offset = atomIndex + warp*PADDED_NUM_ATOMS;
    if (isFirst)
        forceBuffers[offset] += forceSum;
    SYNC_WARPS;
}

41
__kernel void computeInteractionGroups(
42
43
44
45
46
47
#ifdef SUPPORTS_64_BIT_ATOMICS
        __global long* restrict forceBuffers,
#else
        __global real4* restrict forceBuffers,
#endif
        __global real* restrict energyBuffer, __global const real4* restrict posq,
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
        __global const int4* restrict groupData, real4 periodicBoxSize, real4 invPeriodicBoxSize
        PARAMETER_ARGUMENTS) {
    const unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
    const unsigned int warp = get_global_id(0)/TILE_SIZE; // global warpIndex
    const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); // index within the warp
    const unsigned int tbx = get_local_id(0) - tgx;           // block warpIndex
    real energy = 0.0f;
    __local AtomData localData[THREAD_BLOCK_SIZE];

    const unsigned int startTile = FIRST_TILE+warp*(LAST_TILE-FIRST_TILE)/totalWarps;
    const unsigned int endTile = FIRST_TILE+(warp+1)*(LAST_TILE-FIRST_TILE)/totalWarps;
    for (int tile = startTile; tile < endTile; tile++) {
        const int4 atomData = groupData[TILE_SIZE*tile+tgx];
        const int atom1 = atomData.x;
        const int atom2 = atomData.y;
        const int rangeStart = atomData.z&0xFFFF;
        const int rangeEnd = (atomData.z>>16)&0xFFFF;
        const int exclusions = atomData.w;
        real4 posq1 = posq[atom1];
        LOAD_ATOM1_PARAMETERS
        real4 force = (real4) (0);
        real4 posq2 = posq[atom2];
        localData[get_local_id(0)].x = posq2.x;
        localData[get_local_id(0)].y = posq2.y;
        localData[get_local_id(0)].z = posq2.z;
        localData[get_local_id(0)].q = posq2.w;
        LOAD_LOCAL_PARAMETERS
        localData[get_local_id(0)].fx = 0.0f;
        localData[get_local_id(0)].fy = 0.0f;
        localData[get_local_id(0)].fz = 0.0f;
        int tj = tgx;
        SYNC_WARPS;
        for (int j = rangeStart; j < rangeEnd; j++) {
            bool isExcluded = (((exclusions>>tj)&1) == 0);
            int localIndex = tbx+tj;
            posq2 = (real4) (localData[localIndex].x, localData[localIndex].y, localData[localIndex].z, localData[localIndex].q);
            real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
            delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
            real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
            if (!isExcluded && r2 < CUTOFF_SQUARED) {
#endif
                real invR = RSQRT(r2);
                real r = RECIP(invR);
                LOAD_ATOM2_PARAMETERS
                real dEdR = 0.0f;
                real tempEnergy = 0.0f;
                COMPUTE_INTERACTION
                energy += tempEnergy;
                delta *= dEdR;
                force.xyz -= delta.xyz;
                localData[localIndex].fx += delta.x;
                localData[localIndex].fy += delta.y;
                localData[localIndex].fz += delta.z;
#ifdef USE_CUTOFF
            }
#endif
            tj = (tj == rangeEnd-1 ? rangeStart : tj+1);
            SYNC_WARPS;
        }
110
#ifdef SUPPORTS_64_BIT_ATOMICS
111
112
113
114
115
116
117
118
        if (exclusions != 0) {
            atom_add(&forceBuffers[atom1], (long) (force.x*0x100000000));
            atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
            atom_add(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
            atom_add(&forceBuffers[atom2], (long) (localData[get_local_id(0)].fx*0x100000000));
            atom_add(&forceBuffers[atom2+PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fy*0x100000000));
            atom_add(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fz*0x100000000));
        }
119
120
121
122
123
124
125
#else
        writeForces(forceBuffers, localData, atom2);
        localData[get_local_id(0)].fx = force.x;
        localData[get_local_id(0)].fy = force.y;
        localData[get_local_id(0)].fz = force.z;
        writeForces(forceBuffers, localData, atom1);
#endif
126
127
128
    }
    energyBuffer[get_global_id(0)] += energy;
}